9 files changed, 2955 insertions, 0 deletions
diff --git a/src/image/jpeg/dct_test.go b/src/image/jpeg/dct_test.go
new file mode 100644
index 000000000..845e75887
--- /dev/null
+++ b/src/image/jpeg/dct_test.go
@@ -0,0 +1,299 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package jpeg
+
+import (
+	"bytes"
+	"fmt"
+	"math"
+	"math/rand"
+	"testing"
+)
+
+func benchmarkDCT(b *testing.B, f func(*block)) {
+	b.StopTimer()
+	blocks := make([]block, 0, b.N*len(testBlocks))
+	for i := 0; i < b.N; i++ {
+		blocks = append(blocks, testBlocks[:]...)
+	}
+	b.StartTimer()
+	for i := range blocks {
+		f(&blocks[i])
+	}
+}
+
+func BenchmarkFDCT(b *testing.B) {
+	benchmarkDCT(b, fdct)
+}
+
+func BenchmarkIDCT(b *testing.B) {
+	benchmarkDCT(b, idct)
+}
+
+func TestDCT(t *testing.T) {
+	blocks := make([]block, len(testBlocks))
+	copy(blocks, testBlocks[:])
+
+	// Append some randomly generated blocks of varying sparseness.
+	r := rand.New(rand.NewSource(123))
+	for i := 0; i < 100; i++ {
+		b := block{}
+		n := r.Int() % 64
+		for j := 0; j < n; j++ {
+			b[r.Int()%len(b)] = r.Int31() % 256
+		}
+		blocks = append(blocks, b)
+	}
+
+	// Check that the FDCT and IDCT functions are inverses, after a scale and
+	// level shift. Scaling reduces the rounding errors in the conversion from
+	// floats to ints.
+	for i, b := range blocks {
+		got, want := b, b
+		for j := range got {
+			got[j] = (got[j] - 128) * 8
+		}
+		slowFDCT(&got)
+		slowIDCT(&got)
+		for j := range got {
+			got[j] = got[j]/8 + 128
+		}
+		if differ(&got, &want) {
+			t.Errorf("i=%d: IDCT(FDCT)\nsrc\n%s\ngot\n%s\nwant\n%s\n", i, &b, &got, &want)
+		}
+	}
+
+	// Check that the optimized and slow FDCT implementations agree.
+	// The fdct function already does a scale and level shift.
+	for i, b := range blocks {
+		got, want := b, b
+		fdct(&got)
+		for j := range want {
+			want[j] = (want[j] - 128) * 8
+		}
+		slowFDCT(&want)
+		if differ(&got, &want) {
+			t.Errorf("i=%d: FDCT\nsrc\n%s\ngot\n%s\nwant\n%s\n", i, &b, &got, &want)
+		}
+	}
+
+	// Check that the optimized and slow IDCT implementations agree.
+	for i, b := range blocks {
+		got, want := b, b
+		idct(&got)
+		slowIDCT(&want)
+		if differ(&got, &want) {
+			t.Errorf("i=%d: IDCT\nsrc\n%s\ngot\n%s\nwant\n%s\n", i, &b, &got, &want)
+		}
+	}
+}
+
+// differ reports whether any pair-wise elements in b0 and b1 differ by 2 or
+// more. That tolerance is because there isn't a single definitive decoding of
+// a given JPEG image, even before the YCbCr to RGB conversion; implementations
+// can have different IDCT rounding errors.
+func differ(b0, b1 *block) bool {
+	for i := range b0 {
+		delta := b0[i] - b1[i]
+		if delta < -2 || +2 < delta {
+			return true
+		}
+	}
+	return false
+}
+
+// alpha returns 1 if i is 0 and returns √2 otherwise.
+func alpha(i int) float64 {
+	if i == 0 {
+		return 1
+	}
+	return math.Sqrt2
+}
+
+var cosines [32]float64 // cosines[k] = cos(π/2 * k/8)
+
+func init() {
+	for k := range cosines {
+		cosines[k] = math.Cos(math.Pi * float64(k) / 16)
+	}
+}
+
+// slowFDCT performs the 8*8 2-dimensional forward discrete cosine transform:
+//
+//	dst[u,v] = (1/8) * Σ_x Σ_y alpha(u) * alpha(v) * src[x,y] *
+//		cos((π/2) * (2*x + 1) * u / 8) *
+//		cos((π/2) * (2*y + 1) * v / 8)
+//
+// x and y are in pixel space, and u and v are in transform space.
+//
+// b acts as both dst and src.
+func slowFDCT(b *block) {
+	var dst [blockSize]float64
+	for v := 0; v < 8; v++ {
+		for u := 0; u < 8; u++ {
+			sum := 0.0
+			for y := 0; y < 8; y++ {
+				for x := 0; x < 8; x++ {
+					sum += alpha(u) * alpha(v) * float64(b[8*y+x]) *
+						cosines[((2*x+1)*u)%32] *
+						cosines[((2*y+1)*v)%32]
+				}
+			}
+			dst[8*v+u] = sum / 8
+		}
+	}
+	// Convert from float64 to int32.
+	for i := range dst {
+		b[i] = int32(dst[i] + 0.5)
+	}
+}
+
+// slowIDCT performs the 8*8 2-dimensional inverse discrete cosine transform:
+//
+//	dst[x,y] = (1/8) * Σ_u Σ_v alpha(u) * alpha(v) * src[u,v] *
+//		cos((π/2) * (2*x + 1) * u / 8) *
+//		cos((π/2) * (2*y + 1) * v / 8)
+//
+// x and y are in pixel space, and u and v are in transform space.
+//
+// b acts as both dst and src.
+func slowIDCT(b *block) {
+	var dst [blockSize]float64
+	for y := 0; y < 8; y++ {
+		for x := 0; x < 8; x++ {
+			sum := 0.0
+			for v := 0; v < 8; v++ {
+				for u := 0; u < 8; u++ {
+					sum += alpha(u) * alpha(v) * float64(b[8*v+u]) *
+						cosines[((2*x+1)*u)%32] *
+						cosines[((2*y+1)*v)%32]
+				}
+			}
+			dst[8*y+x] = sum / 8
+		}
+	}
+	// Convert from float64 to int32.
+	for i := range dst {
+		b[i] = int32(dst[i] + 0.5)
+	}
+}
+
+func (b *block) String() string {
+	s := bytes.NewBuffer(nil)
+	fmt.Fprintf(s, "{\n")
+	for y := 0; y < 8; y++ {
+		fmt.Fprintf(s, "\t")
+		for x := 0; x < 8; x++ {
+			fmt.Fprintf(s, "0x%04x, ", uint16(b[8*y+x]))
+		}
+		fmt.Fprintln(s)
+	}
+	fmt.Fprintf(s, "}")
+	return s.String()
+}
+
+// testBlocks are the first 10 pre-IDCT blocks from ../testdata/video-001.jpeg.
+var testBlocks = [10]block{
+	{
+		0x7f, 0xf6, 0x01, 0x07, 0xff, 0x00, 0x00, 0x00,
+		0xf5, 0x01, 0xfa, 0x01, 0xfe, 0x00, 0x01, 0x00,
+		0x05, 0x05, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0xff, 0xf8, 0x00, 0x01, 0xff, 0x00, 0x00,
+		0x00, 0x01, 0x00, 0x01, 0x00, 0xff, 0xff, 0x00,
+		0xff, 0x0c, 0x00, 0x00, 0x00, 0x00, 0xff, 0x01,
+		0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x00, 0x01, 0xff, 0x01, 0x00, 0xfe,
+	},
+	{
+		0x29, 0x07, 0x00, 0xfc, 0x01, 0x01, 0x00, 0x00,
+		0x07, 0x00, 0x03, 0x00, 0x01, 0x00, 0xff, 0xff,
+		0xff, 0xfd, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x04, 0x00, 0xff, 0x01, 0x00, 0x00,
+		0x01, 0x00, 0x01, 0xff, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0xfa, 0x01, 0x00, 0x01, 0x00, 0x01, 0xff,
+		0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0xff, 0x00, 0xff, 0x00, 0x02,
+	},
+	{
+		0xc5, 0xfa, 0x01, 0x00, 0x00, 0x01, 0x00, 0xff,
+		0x02, 0xff, 0x01, 0x00, 0x01, 0x00, 0xff, 0x00,
+		0xff, 0xff, 0x00, 0xff, 0x01, 0x00, 0x00, 0x00,
+		0xff, 0x00, 0x01, 0x00, 0x00, 0x00, 0xff, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
+		0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	},
+	{
+		0x86, 0x05, 0x00, 0x02, 0x00, 0x00, 0x01, 0x00,
+		0xf2, 0x06, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00,
+		0xf6, 0xfa, 0xf9, 0x00, 0xff, 0x01, 0x00, 0x00,
+		0xf9, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00, 0x00,
+		0xff, 0x00, 0x00, 0x01, 0x00, 0xff, 0x01, 0x00,
+		0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x01,
+		0x00, 0x01, 0xff, 0x01, 0x00, 0xff, 0x00, 0x00,
+	},
+	{
+		0x24, 0xfe, 0x00, 0xff, 0x00, 0xff, 0xff, 0x00,
+		0x08, 0xfd, 0x00, 0x01, 0x01, 0x00, 0x01, 0x00,
+		0x06, 0x03, 0x03, 0xff, 0x00, 0x00, 0x00, 0x00,
+		0x04, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
+		0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01,
+		0x01, 0x00, 0x01, 0xff, 0x00, 0x01, 0x00, 0x00,
+		0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0xff, 0x01,
+	},
+	{
+		0xcd, 0xff, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
+		0x03, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
+		0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00,
+		0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0xff,
+	},
+	{
+		0x81, 0xfe, 0x05, 0xff, 0x01, 0xff, 0x01, 0x00,
+		0xef, 0xf9, 0x00, 0xf9, 0x00, 0xff, 0x00, 0xff,
+		0x05, 0xf9, 0x00, 0xf8, 0x01, 0xff, 0x01, 0xff,
+		0x00, 0xff, 0x07, 0x00, 0x01, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x01,
+		0xff, 0x01, 0x01, 0x00, 0xff, 0x00, 0x00, 0x00,
+		0x01, 0x01, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
+	},
+	{
+		0x28, 0x00, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x0b, 0x02, 0x01, 0x03, 0x00, 0xff, 0x00, 0x01,
+		0xfe, 0x02, 0x01, 0x03, 0xff, 0x00, 0x00, 0x00,
+		0x01, 0x00, 0xfd, 0x00, 0x01, 0x00, 0xff, 0x00,
+		0x01, 0xff, 0x00, 0xff, 0x01, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0xff, 0x01, 0x01, 0x00, 0xff,
+		0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0xff, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x01,
+	},
+	{
+		0xdf, 0xf9, 0xfe, 0x00, 0x03, 0x01, 0xff, 0xff,
+		0x04, 0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
+		0xff, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01,
+		0x00, 0x00, 0xfe, 0x01, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0xff, 0x01, 0x00, 0x00, 0x00, 0x01,
+		0xff, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+		0x00, 0xff, 0x00, 0xff, 0x01, 0x00, 0x00, 0x01,
+		0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+	},
+	{
+		0x88, 0xfd, 0x00, 0x00, 0xff, 0x00, 0x01, 0xff,
+		0xe1, 0x06, 0x06, 0x01, 0xff, 0x00, 0x01, 0x00,
+		0x08, 0x00, 0xfa, 0x00, 0xff, 0xff, 0xff, 0xff,
+		0x08, 0x01, 0x00, 0xff, 0x01, 0xff, 0x00, 0x00,
+		0xf5, 0xff, 0x00, 0x01, 0xff, 0x01, 0x01, 0x00,
+		0xff, 0xff, 0x01, 0xff, 0x01, 0x00, 0x01, 0x00,
+		0x00, 0x01, 0x01, 0xff, 0x00, 0xff, 0x00, 0x01,
+		0x02, 0x00, 0x00, 0xff, 0xff, 0x00, 0xff, 0x00,
+	},
+}
diff --git a/src/image/jpeg/fdct.go b/src/image/jpeg/fdct.go
new file mode 100644
index 000000000..3f8be4e32
--- /dev/null
+++ b/src/image/jpeg/fdct.go
@@ -0,0 +1,190 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package jpeg
+
+// This file implements a Forward Discrete Cosine Transformation.
+
+/*
+It is based on the code in jfdctint.c from the Independent JPEG Group,
+found at http://www.ijg.org/files/jpegsrc.v8c.tar.gz.
+
+The "LEGAL ISSUES" section of the README in that archive says:
+
+In plain English:
+
+1. We don't promise that this software works.  (But if you find any bugs,
+   please let us know!)
+2. You can use this software for whatever you want.  You don't have to pay us.
+3. You may not pretend that you wrote this software.  If you use it in a
+   program, you must acknowledge somewhere in your documentation that
+   you've used the IJG code.
+
+In legalese:
+
+The authors make NO WARRANTY or representation, either express or implied,
+with respect to this software, its quality, accuracy, merchantability, or
+fitness for a particular purpose.  This software is provided "AS IS", and you,
+its user, assume the entire risk as to its quality and accuracy.
+
+This software is copyright (C) 1991-2011, Thomas G. Lane, Guido Vollbeding.
+All Rights Reserved except as specified below.
+
+Permission is hereby granted to use, copy, modify, and distribute this
+software (or portions thereof) for any purpose, without fee, subject to these
+conditions:
+(1) If any part of the source code for this software is distributed, then this
+README file must be included, with this copyright and no-warranty notice
+unaltered; and any additions, deletions, or changes to the original files
+must be clearly indicated in accompanying documentation.
+(2) If only executable code is distributed, then the accompanying
+documentation must state that "this software is based in part on the work of
+the Independent JPEG Group".
+(3) Permission for use of this software is granted only if the user accepts
+full responsibility for any undesirable consequences; the authors accept
+NO LIABILITY for damages of any kind.
+
+These conditions apply to any software derived from or based on the IJG code,
+not just to the unmodified library.  If you use our work, you ought to
+acknowledge us.
+
+Permission is NOT granted for the use of any IJG author's name or company name
+in advertising or publicity relating to this software or products derived from
+it.  This software may be referred to only as "the Independent JPEG Group's
+software".
+
+We specifically permit and encourage the use of this software as the basis of
+commercial products, provided that all warranty or liability claims are
+assumed by the product vendor.
+*/
+
+// Trigonometric constants in 13-bit fixed point format.
+const (
+	fix_0_298631336 = 2446
+	fix_0_390180644 = 3196
+	fix_0_541196100 = 4433
+	fix_0_765366865 = 6270
+	fix_0_899976223 = 7373
+	fix_1_175875602 = 9633
+	fix_1_501321110 = 12299
+	fix_1_847759065 = 15137
+	fix_1_961570560 = 16069
+	fix_2_053119869 = 16819
+	fix_2_562915447 = 20995
+	fix_3_072711026 = 25172
+)
+
+const (
+	constBits     = 13
+	pass1Bits     = 2
+	centerJSample = 128
+)
+
+// fdct performs a forward DCT on an 8x8 block of coefficients, including a
+// level shift.
+func fdct(b *block) {
+	// Pass 1: process rows.
+	for y := 0; y < 8; y++ {
+		x0 := b[y*8+0]
+		x1 := b[y*8+1]
+		x2 := b[y*8+2]
+		x3 := b[y*8+3]
+		x4 := b[y*8+4]
+		x5 := b[y*8+5]
+		x6 := b[y*8+6]
+		x7 := b[y*8+7]
+
+		tmp0 := x0 + x7
+		tmp1 := x1 + x6
+		tmp2 := x2 + x5
+		tmp3 := x3 + x4
+
+		tmp10 := tmp0 + tmp3
+		tmp12 := tmp0 - tmp3
+		tmp11 := tmp1 + tmp2
+		tmp13 := tmp1 - tmp2
+
+		tmp0 = x0 - x7
+		tmp1 = x1 - x6
+		tmp2 = x2 - x5
+		tmp3 = x3 - x4
+
+		b[y*8+0] = (tmp10 + tmp11 - 8*centerJSample) << pass1Bits
+		b[y*8+4] = (tmp10 - tmp11) << pass1Bits
+		z1 := (tmp12 + tmp13) * fix_0_541196100
+		z1 += 1 << (constBits - pass1Bits - 1)
+		b[y*8+2] = (z1 + tmp12*fix_0_765366865) >> (constBits - pass1Bits)
+		b[y*8+6] = (z1 - tmp13*fix_1_847759065) >> (constBits - pass1Bits)
+
+		tmp10 = tmp0 + tmp3
+		tmp11 = tmp1 + tmp2
+		tmp12 = tmp0 + tmp2
+		tmp13 = tmp1 + tmp3
+		z1 = (tmp12 + tmp13) * fix_1_175875602
+		z1 += 1 << (constBits - pass1Bits - 1)
+		tmp0 = tmp0 * fix_1_501321110
+		tmp1 = tmp1 * fix_3_072711026
+		tmp2 = tmp2 * fix_2_053119869
+		tmp3 = tmp3 * fix_0_298631336
+		tmp10 = tmp10 * -fix_0_899976223
+		tmp11 = tmp11 * -fix_2_562915447
+		tmp12 = tmp12 * -fix_0_390180644
+		tmp13 = tmp13 * -fix_1_961570560
+
+		tmp12 += z1
+		tmp13 += z1
+		b[y*8+1] = (tmp0 + tmp10 + tmp12) >> (constBits - pass1Bits)
+		b[y*8+3] = (tmp1 + tmp11 + tmp13) >> (constBits - pass1Bits)
+		b[y*8+5] = (tmp2 + tmp11 + tmp12) >> (constBits - pass1Bits)
+		b[y*8+7] = (tmp3 + tmp10 + tmp13) >> (constBits - pass1Bits)
+	}
+	// Pass 2: process columns.
+	// We remove pass1Bits scaling, but leave results scaled up by an overall factor of 8.
+	for x := 0; x < 8; x++ {
+		tmp0 := b[0*8+x] + b[7*8+x]
+		tmp1 := b[1*8+x] + b[6*8+x]
+		tmp2 := b[2*8+x] + b[5*8+x]
+		tmp3 := b[3*8+x] + b[4*8+x]
+
+		tmp10 := tmp0 + tmp3 + 1<<(pass1Bits-1)
+		tmp12 := tmp0 - tmp3
+		tmp11 := tmp1 + tmp2
+		tmp13 := tmp1 - tmp2
+
+		tmp0 = b[0*8+x] - b[7*8+x]
+		tmp1 = b[1*8+x] - b[6*8+x]
+		tmp2 = b[2*8+x] - b[5*8+x]
+		tmp3 = b[3*8+x] - b[4*8+x]
+
+		b[0*8+x] = (tmp10 + tmp11) >> pass1Bits
+		b[4*8+x] = (tmp10 - tmp11) >> pass1Bits
+
+		z1 := (tmp12 + tmp13) * fix_0_541196100
+		z1 += 1 << (constBits + pass1Bits - 1)
+		b[2*8+x] = (z1 + tmp12*fix_0_765366865) >> (constBits + pass1Bits)
+		b[6*8+x] = (z1 - tmp13*fix_1_847759065) >> (constBits + pass1Bits)
+
+		tmp10 = tmp0 + tmp3
+		tmp11 = tmp1 + tmp2
+		tmp12 = tmp0 + tmp2
+		tmp13 = tmp1 + tmp3
+		z1 = (tmp12 + tmp13) * fix_1_175875602
+		z1 += 1 << (constBits + pass1Bits - 1)
+		tmp0 = tmp0 * fix_1_501321110
+		tmp1 = tmp1 * fix_3_072711026
+		tmp2 = tmp2 * fix_2_053119869
+		tmp3 = tmp3 * fix_0_298631336
+		tmp10 = tmp10 * -fix_0_899976223
+		tmp11 = tmp11 * -fix_2_562915447
+		tmp12 = tmp12 * -fix_0_390180644
+		tmp13 = tmp13 * -fix_1_961570560
+
+		tmp12 += z1
+		tmp13 += z1
+		b[1*8+x] = (tmp0 + tmp10 + tmp12) >> (constBits + pass1Bits)
+		b[3*8+x] = (tmp1 + tmp11 + tmp13) >> (constBits + pass1Bits)
+		b[5*8+x] = (tmp2 + tmp11 + tmp12) >> (constBits + pass1Bits)
+		b[7*8+x] = (tmp3 + tmp10 + tmp13) >> (constBits + pass1Bits)
+	}
+}
diff --git a/src/image/jpeg/huffman.go b/src/image/jpeg/huffman.go
new file mode 100644
index 000000000..d4ff4cfa0
--- /dev/null
+++ b/src/image/jpeg/huffman.go
@@ -0,0 +1,244 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package jpeg
+
+import (
+	"io"
+)
+
+// maxCodeLength is the maximum (inclusive) number of bits in a Huffman code.
+const maxCodeLength = 16
+
+// maxNCodes is the maximum (inclusive) number of codes in a Huffman tree.
+const maxNCodes = 256
+
+// lutSize is the log-2 size of the Huffman decoder's look-up table.
+const lutSize = 8
+
+// huffman is a Huffman decoder, specified in section C.
+type huffman struct {
+	// length is the number of codes in the tree.
+	nCodes int32
+	// lut is the look-up table for the next lutSize bits in the bit-stream.
+	// The high 8 bits of the uint16 are the encoded value. The low 8 bits
+	// are 1 plus the code length, or 0 if the value is too large to fit in
+	// lutSize bits.
+	lut [1 << lutSize]uint16
+	// vals are the decoded values, sorted by their encoding.
+	vals [maxNCodes]uint8
+	// minCodes[i] is the minimum code of length i, or -1 if there are no
+	// codes of that length.
+	minCodes [maxCodeLength]int32
+	// maxCodes[i] is the maximum code of length i, or -1 if there are no
+	// codes of that length.
+	maxCodes [maxCodeLength]int32
+	// valsIndices[i] is the index into vals of minCodes[i].
+	valsIndices [maxCodeLength]int32
+}
+
+// errShortHuffmanData means that an unexpected EOF occurred while decoding
+// Huffman data.
+var errShortHuffmanData = FormatError("short Huffman data")
+
+// ensureNBits reads bytes from the byte buffer to ensure that d.bits.n is at
+// least n. For best performance (avoiding function calls inside hot loops),
+// the caller is the one responsible for first checking that d.bits.n < n.
+func (d *decoder) ensureNBits(n int32) error {
+	for {
+		c, err := d.readByteStuffedByte()
+		if err != nil {
+			if err == io.EOF {
+				return errShortHuffmanData
+			}
+			return err
+		}
+		d.bits.a = d.bits.a<<8 | uint32(c)
+		d.bits.n += 8
+		if d.bits.m == 0 {
+			d.bits.m = 1 << 7
+		} else {
+			d.bits.m <<= 8
+		}
+		if d.bits.n >= n {
+			break
+		}
+	}
+	return nil
+}
+
+// receiveExtend is the composition of RECEIVE and EXTEND, specified in section
+// F.2.2.1.
+func (d *decoder) receiveExtend(t uint8) (int32, error) {
+	if d.bits.n < int32(t) {
+		if err := d.ensureNBits(int32(t)); err != nil {
+			return 0, err
+		}
+	}
+	d.bits.n -= int32(t)
+	d.bits.m >>= t
+	s := int32(1) << t
+	x := int32(d.bits.a>>uint8(d.bits.n)) & (s - 1)
+	if x < s>>1 {
+		x += ((-1) << t) + 1
+	}
+	return x, nil
+}
+
+// processDHT processes a Define Huffman Table marker, and initializes a huffman
+// struct from its contents. Specified in section B.2.4.2.
+func (d *decoder) processDHT(n int) error {
+	for n > 0 {
+		if n < 17 {
+			return FormatError("DHT has wrong length")
+		}
+		if err := d.readFull(d.tmp[:17]); err != nil {
+			return err
+		}
+		tc := d.tmp[0] >> 4
+		if tc > maxTc {
+			return FormatError("bad Tc value")
+		}
+		th := d.tmp[0] & 0x0f
+		if th > maxTh || !d.progressive && th > 1 {
+			return FormatError("bad Th value")
+		}
+		h := &d.huff[tc][th]
+
+		// Read nCodes and h.vals (and derive h.nCodes).
+		// nCodes[i] is the number of codes with code length i.
+		// h.nCodes is the total number of codes.
+		h.nCodes = 0
+		var nCodes [maxCodeLength]int32
+		for i := range nCodes {
+			nCodes[i] = int32(d.tmp[i+1])
+			h.nCodes += nCodes[i]
+		}
+		if h.nCodes == 0 {
+			return FormatError("Huffman table has zero length")
+		}
+		if h.nCodes > maxNCodes {
+			return FormatError("Huffman table has excessive length")
+		}
+		n -= int(h.nCodes) + 17
+		if n < 0 {
+			return FormatError("DHT has wrong length")
+		}
+		if err := d.readFull(h.vals[:h.nCodes]); err != nil {
+			return err
+		}
+
+		// Derive the look-up table.
+		for i := range h.lut {
+			h.lut[i] = 0
+		}
+		var x, code uint32
+		for i := uint32(0); i < lutSize; i++ {
+			code <<= 1
+			for j := int32(0); j < nCodes[i]; j++ {
+				// The codeLength is 1+i, so shift code by 8-(1+i) to
+				// calculate the high bits for every 8-bit sequence
+				// whose codeLength's high bits matches code.
+				// The high 8 bits of lutValue are the encoded value.
+				// The low 8 bits are 1 plus the codeLength.
+				base := uint8(code << (7 - i))
+				lutValue := uint16(h.vals[x])<<8 | uint16(2+i)
+				for k := uint8(0); k < 1<<(7-i); k++ {
+					h.lut[base|k] = lutValue
+				}
+				code++
+				x++
+			}
+		}
+
+		// Derive minCodes, maxCodes, and valsIndices.
+		var c, index int32
+		for i, n := range nCodes {
+			if n == 0 {
+				h.minCodes[i] = -1
+				h.maxCodes[i] = -1
+				h.valsIndices[i] = -1
+			} else {
+				h.minCodes[i] = c
+				h.maxCodes[i] = c + n - 1
+				h.valsIndices[i] = index
+				c += n
+				index += n
+			}
+			c <<= 1
+		}
+	}
+	return nil
+}
+
+// decodeHuffman returns the next Huffman-coded value from the bit-stream,
+// decoded according to h.
+func (d *decoder) decodeHuffman(h *huffman) (uint8, error) {
+	if h.nCodes == 0 {
+		return 0, FormatError("uninitialized Huffman table")
+	}
+
+	if d.bits.n < 8 {
+		if err := d.ensureNBits(8); err != nil {
+			if err != errMissingFF00 && err != errShortHuffmanData {
+				return 0, err
+			}
+			// There are no more bytes of data in this segment, but we may still
+			// be able to read the next symbol out of the previously read bits.
+			// First, undo the readByte that the ensureNBits call made.
+			d.unreadByteStuffedByte()
+			goto slowPath
+		}
+	}
+	if v := h.lut[(d.bits.a>>uint32(d.bits.n-lutSize))&0xff]; v != 0 {
+		n := (v & 0xff) - 1
+		d.bits.n -= int32(n)
+		d.bits.m >>= n
+		return uint8(v >> 8), nil
+	}
+
+slowPath:
+	for i, code := 0, int32(0); i < maxCodeLength; i++ {
+		if d.bits.n == 0 {
+			if err := d.ensureNBits(1); err != nil {
+				return 0, err
+			}
+		}
+		if d.bits.a&d.bits.m != 0 {
+			code |= 1
+		}
+		d.bits.n--
+		d.bits.m >>= 1
+		if code <= h.maxCodes[i] {
+			return h.vals[h.valsIndices[i]+code-h.minCodes[i]], nil
+		}
+		code <<= 1
+	}
+	return 0, FormatError("bad Huffman code")
+}
+
+func (d *decoder) decodeBit() (bool, error) {
+	if d.bits.n == 0 {
+		if err := d.ensureNBits(1); err != nil {
+			return false, err
+		}
+	}
+	ret := d.bits.a&d.bits.m != 0
+	d.bits.n--
+	d.bits.m >>= 1
+	return ret, nil
+}
+
+func (d *decoder) decodeBits(n int32) (uint32, error) {
+	if d.bits.n < n {
+		if err := d.ensureNBits(n); err != nil {
+			return 0, err
+		}
+	}
+	ret := d.bits.a >> uint32(d.bits.n-n)
+	ret &= (1 << uint32(n)) - 1
+	d.bits.n -= n
+	d.bits.m >>= uint32(n)
+	return ret, nil
+}
diff --git a/src/image/jpeg/idct.go b/src/image/jpeg/idct.go
new file mode 100644
index 000000000..46fcaecb7
--- /dev/null
+++ b/src/image/jpeg/idct.go
@@ -0,0 +1,192 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package jpeg
+
+// This is a Go translation of idct.c from
+//
+// http://standards.iso.org/ittf/PubliclyAvailableStandards/ISO_IEC_13818-4_2004_Conformance_Testing/Video/verifier/mpeg2decode_960109.tar.gz
+//
+// which carries the following notice:
+
+/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
+
+/*
+ * Disclaimer of Warranty
+ *
+ * These software programs are available to the user without any license fee or
+ * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
+ * any and all warranties, whether express, implied, or statuary, including any
+ * implied warranties or merchantability or of fitness for a particular
+ * purpose.  In no event shall the copyright-holder be liable for any
+ * incidental, punitive, or consequential damages of any kind whatsoever
+ * arising from the use of these programs.
+ *
+ * This disclaimer of warranty extends to the user of these programs and user's
+ * customers, employees, agents, transferees, successors, and assigns.
+ *
+ * The MPEG Software Simulation Group does not represent or warrant that the
+ * programs furnished hereunder are free of infringement of any third-party
+ * patents.
+ *
+ * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
+ * are subject to royalty fees to patent holders.  Many of these patents are
+ * general enough such that they are unavoidable regardless of implementation
+ * design.
+ *
+ */
+
+const blockSize = 64 // A DCT block is 8x8.
+
+type block [blockSize]int32
+
+const (
+	w1 = 2841 // 2048*sqrt(2)*cos(1*pi/16)
+	w2 = 2676 // 2048*sqrt(2)*cos(2*pi/16)
+	w3 = 2408 // 2048*sqrt(2)*cos(3*pi/16)
+	w5 = 1609 // 2048*sqrt(2)*cos(5*pi/16)
+	w6 = 1108 // 2048*sqrt(2)*cos(6*pi/16)
+	w7 = 565  // 2048*sqrt(2)*cos(7*pi/16)
+
+	w1pw7 = w1 + w7
+	w1mw7 = w1 - w7
+	w2pw6 = w2 + w6
+	w2mw6 = w2 - w6
+	w3pw5 = w3 + w5
+	w3mw5 = w3 - w5
+
+	r2 = 181 // 256/sqrt(2)
+)
+
+// idct performs a 2-D Inverse Discrete Cosine Transformation.
+//
+// The input coefficients should already have been multiplied by the
+// appropriate quantization table. We use fixed-point computation, with the
+// number of bits for the fractional component varying over the intermediate
+// stages.
+//
+// For more on the actual algorithm, see Z. Wang, "Fast algorithms for the
+// discrete W transform and for the discrete Fourier transform", IEEE Trans. on
+// ASSP, Vol. ASSP- 32, pp. 803-816, Aug. 1984.
+func idct(src *block) {
+	// Horizontal 1-D IDCT.
+	for y := 0; y < 8; y++ {
+		y8 := y * 8
+		// If all the AC components are zero, then the IDCT is trivial.
+		if src[y8+1] == 0 && src[y8+2] == 0 && src[y8+3] == 0 &&
+			src[y8+4] == 0 && src[y8+5] == 0 && src[y8+6] == 0 && src[y8+7] == 0 {
+			dc := src[y8+0] << 3
+			src[y8+0] = dc
+			src[y8+1] = dc
+			src[y8+2] = dc
+			src[y8+3] = dc
+			src[y8+4] = dc
+			src[y8+5] = dc
+			src[y8+6] = dc
+			src[y8+7] = dc
+			continue
+		}
+
+		// Prescale.
+		x0 := (src[y8+0] << 11) + 128
+		x1 := src[y8+4] << 11
+		x2 := src[y8+6]
+		x3 := src[y8+2]
+		x4 := src[y8+1]
+		x5 := src[y8+7]
+		x6 := src[y8+5]
+		x7 := src[y8+3]
+
+		// Stage 1.
+		x8 := w7 * (x4 + x5)
+		x4 = x8 + w1mw7*x4
+		x5 = x8 - w1pw7*x5
+		x8 = w3 * (x6 + x7)
+		x6 = x8 - w3mw5*x6
+		x7 = x8 - w3pw5*x7
+
+		// Stage 2.
+		x8 = x0 + x1
+		x0 -= x1
+		x1 = w6 * (x3 + x2)
+		x2 = x1 - w2pw6*x2
+		x3 = x1 + w2mw6*x3
+		x1 = x4 + x6
+		x4 -= x6
+		x6 = x5 + x7
+		x5 -= x7
+
+		// Stage 3.
+		x7 = x8 + x3
+		x8 -= x3
+		x3 = x0 + x2
+		x0 -= x2
+		x2 = (r2*(x4+x5) + 128) >> 8
+		x4 = (r2*(x4-x5) + 128) >> 8
+
+		// Stage 4.
+		src[y8+0] = (x7 + x1) >> 8
+		src[y8+1] = (x3 + x2) >> 8
+		src[y8+2] = (x0 + x4) >> 8
+		src[y8+3] = (x8 + x6) >> 8
+		src[y8+4] = (x8 - x6) >> 8
+		src[y8+5] = (x0 - x4) >> 8
+		src[y8+6] = (x3 - x2) >> 8
+		src[y8+7] = (x7 - x1) >> 8
+	}
+
+	// Vertical 1-D IDCT.
+	for x := 0; x < 8; x++ {
+		// Similar to the horizontal 1-D IDCT case, if all the AC components are zero, then the IDCT is trivial.
+		// However, after performing the horizontal 1-D IDCT, there are typically non-zero AC components, so
+		// we do not bother to check for the all-zero case.
+
+		// Prescale.
+		y0 := (src[8*0+x] << 8) + 8192
+		y1 := src[8*4+x] << 8
+		y2 := src[8*6+x]
+		y3 := src[8*2+x]
+		y4 := src[8*1+x]
+		y5 := src[8*7+x]
+		y6 := src[8*5+x]
+		y7 := src[8*3+x]
+
+		// Stage 1.
+		y8 := w7*(y4+y5) + 4
+		y4 = (y8 + w1mw7*y4) >> 3
+		y5 = (y8 - w1pw7*y5) >> 3
+		y8 = w3*(y6+y7) + 4
+		y6 = (y8 - w3mw5*y6) >> 3
+		y7 = (y8 - w3pw5*y7) >> 3
+
+		// Stage 2.
+		y8 = y0 + y1
+		y0 -= y1
+		y1 = w6*(y3+y2) + 4
+		y2 = (y1 - w2pw6*y2) >> 3
+		y3 = (y1 + w2mw6*y3) >> 3
+		y1 = y4 + y6
+		y4 -= y6
+		y6 = y5 + y7
+		y5 -= y7
+
+		// Stage 3.
+		y7 = y8 + y3
+		y8 -= y3
+		y3 = y0 + y2
+		y0 -= y2
+		y2 = (r2*(y4+y5) + 128) >> 8
+		y4 = (r2*(y4-y5) + 128) >> 8
+
+		// Stage 4.
+		src[8*0+x] = (y7 + y1) >> 14
+		src[8*1+x] = (y3 + y2) >> 14
+		src[8*2+x] = (y0 + y4) >> 14
+		src[8*3+x] = (y8 + y6) >> 14
+		src[8*4+x] = (y8 - y6) >> 14
+		src[8*5+x] = (y0 - y4) >> 14
+		src[8*6+x] = (y3 - y2) >> 14
+		src[8*7+x] = (y7 - y1) >> 14
+	}
+}
diff --git a/src/image/jpeg/reader.go b/src/image/jpeg/reader.go
new file mode 100644
index 000000000..c8fae3cea
--- /dev/null
+++ b/src/image/jpeg/reader.go
@@ -0,0 +1,524 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package jpeg implements a JPEG image decoder and encoder.
+//
+// JPEG is defined in ITU-T T.81: http://www.w3.org/Graphics/JPEG/itu-t81.pdf.
+package jpeg
+
+import (
+	"image"
+	"image/color"
+	"io"
+)
+
+// TODO(nigeltao): fix up the doc comment style so that sentences start with
+// the name of the type or function that they annotate.
+
+// A FormatError reports that the input is not a valid JPEG.
+type FormatError string
+
+func (e FormatError) Error() string { return "invalid JPEG format: " + string(e) }
+
+// An UnsupportedError reports that the input uses a valid but unimplemented JPEG feature.
+type UnsupportedError string
+
+func (e UnsupportedError) Error() string { return "unsupported JPEG feature: " + string(e) }
+
+// Component specification, specified in section B.2.2.
+type component struct {
+	h  int   // Horizontal sampling factor.
+	v  int   // Vertical sampling factor.
+	c  uint8 // Component identifier.
+	tq uint8 // Quantization table destination selector.
+}
+
+const (
+	dcTable = 0
+	acTable = 1
+	maxTc   = 1
+	maxTh   = 3
+	maxTq   = 3
+
+	// A grayscale JPEG image has only a Y component.
+	nGrayComponent = 1
+	// A color JPEG image has Y, Cb and Cr components.
+	nColorComponent = 3
+
+	// We only support 4:4:4, 4:4:0, 4:2:2 and 4:2:0 downsampling, and therefore the
+	// number of luma samples per chroma sample is at most 2 in the horizontal
+	// and 2 in the vertical direction.
+	maxH = 2
+	maxV = 2
+)
+
+const (
+	soiMarker   = 0xd8 // Start Of Image.
+	eoiMarker   = 0xd9 // End Of Image.
+	sof0Marker  = 0xc0 // Start Of Frame (Baseline).
+	sof2Marker  = 0xc2 // Start Of Frame (Progressive).
+	dhtMarker   = 0xc4 // Define Huffman Table.
+	dqtMarker   = 0xdb // Define Quantization Table.
+	sosMarker   = 0xda // Start Of Scan.
+	driMarker   = 0xdd // Define Restart Interval.
+	rst0Marker  = 0xd0 // ReSTart (0).
+	rst7Marker  = 0xd7 // ReSTart (7).
+	app0Marker  = 0xe0 // APPlication specific (0).
+	app15Marker = 0xef // APPlication specific (15).
+	comMarker   = 0xfe // COMment.
+)
+
+// unzig maps from the zig-zag ordering to the natural ordering. For example,
+// unzig[3] is the column and row of the fourth element in zig-zag order. The
+// value is 16, which means first column (16%8 == 0) and third row (16/8 == 2).
+var unzig = [blockSize]int{
+	0, 1, 8, 16, 9, 2, 3, 10,
+	17, 24, 32, 25, 18, 11, 4, 5,
+	12, 19, 26, 33, 40, 48, 41, 34,
+	27, 20, 13, 6, 7, 14, 21, 28,
+	35, 42, 49, 56, 57, 50, 43, 36,
+	29, 22, 15, 23, 30, 37, 44, 51,
+	58, 59, 52, 45, 38, 31, 39, 46,
+	53, 60, 61, 54, 47, 55, 62, 63,
+}
+
+// Reader is deprecated.
+type Reader interface {
+	io.ByteReader
+	io.Reader
+}
+
+// bits holds the unprocessed bits that have been taken from the byte-stream.
+// The n least significant bits of a form the unread bits, to be read in MSB to
+// LSB order.
+type bits struct {
+	a uint32 // accumulator.
+	m uint32 // mask. m==1<<(n-1) when n>0, with m==0 when n==0.
+	n int32  // the number of unread bits in a.
+}
+
+type decoder struct {
+	r    io.Reader
+	bits bits
+	// bytes is a byte buffer, similar to a bufio.Reader, except that it
+	// has to be able to unread more than 1 byte, due to byte stuffing.
+	// Byte stuffing is specified in section F.1.2.3.
+	bytes struct {
+		// buf[i:j] are the buffered bytes read from the underlying
+		// io.Reader that haven't yet been passed further on.
+		buf  [4096]byte
+		i, j int
+		// nUnreadable is the number of bytes to back up i after
+		// overshooting. It can be 0, 1 or 2.
+		nUnreadable int
+	}
+	width, height int
+	img1          *image.Gray
+	img3          *image.YCbCr
+	ri            int // Restart Interval.
+	nComp         int
+	progressive   bool
+	eobRun        uint16 // End-of-Band run, specified in section G.1.2.2.
+	comp          [nColorComponent]component
+	progCoeffs    [nColorComponent][]block // Saved state between progressive-mode scans.
+	huff          [maxTc + 1][maxTh + 1]huffman
+	quant         [maxTq + 1]block // Quantization tables, in zig-zag order.
+	tmp           [blockSize + 1]byte
+}
+
+// fill fills up the d.bytes.buf buffer from the underlying io.Reader. It
+// should only be called when there are no unread bytes in d.bytes.
+func (d *decoder) fill() error {
+	if d.bytes.i != d.bytes.j {
+		panic("jpeg: fill called when unread bytes exist")
+	}
+	// Move the last 2 bytes to the start of the buffer, in case we need
+	// to call unreadByteStuffedByte.
+	if d.bytes.j > 2 {
+		d.bytes.buf[0] = d.bytes.buf[d.bytes.j-2]
+		d.bytes.buf[1] = d.bytes.buf[d.bytes.j-1]
+		d.bytes.i, d.bytes.j = 2, 2
+	}
+	// Fill in the rest of the buffer.
+	n, err := d.r.Read(d.bytes.buf[d.bytes.j:])
+	d.bytes.j += n
+	return err
+}
+
+// unreadByteStuffedByte undoes the most recent readByteStuffedByte call,
+// giving a byte of data back from d.bits to d.bytes. The Huffman look-up table
+// requires at least 8 bits for look-up, which means that Huffman decoding can
+// sometimes overshoot and read one or two too many bytes. Two-byte overshoot
+// can happen when expecting to read a 0xff 0x00 byte-stuffed byte.
+func (d *decoder) unreadByteStuffedByte() {
+	if d.bytes.nUnreadable == 0 {
+		panic("jpeg: unreadByteStuffedByte call cannot be fulfilled")
+	}
+	d.bytes.i -= d.bytes.nUnreadable
+	d.bytes.nUnreadable = 0
+	if d.bits.n >= 8 {
+		d.bits.a >>= 8
+		d.bits.n -= 8
+		d.bits.m >>= 8
+	}
+}
+
+// readByte returns the next byte, whether buffered or not buffered. It does
+// not care about byte stuffing.
+func (d *decoder) readByte() (x byte, err error) {
+	for d.bytes.i == d.bytes.j {
+		if err = d.fill(); err != nil {
+			return 0, err
+		}
+	}
+	x = d.bytes.buf[d.bytes.i]
+	d.bytes.i++
+	d.bytes.nUnreadable = 0
+	return x, nil
+}
+
+// errMissingFF00 means that readByteStuffedByte encountered an 0xff byte (a
+// marker byte) that wasn't the expected byte-stuffed sequence 0xff, 0x00.
+var errMissingFF00 = FormatError("missing 0xff00 sequence")
+
+// readByteStuffedByte is like readByte but is for byte-stuffed Huffman data.
+func (d *decoder) readByteStuffedByte() (x byte, err error) {
+	// Take the fast path if d.bytes.buf contains at least two bytes.
+	if d.bytes.i+2 <= d.bytes.j {
+		x = d.bytes.buf[d.bytes.i]
+		d.bytes.i++
+		d.bytes.nUnreadable = 1
+		if x != 0xff {
+			return x, err
+		}
+		if d.bytes.buf[d.bytes.i] != 0x00 {
+			return 0, errMissingFF00
+		}
+		d.bytes.i++
+		d.bytes.nUnreadable = 2
+		return 0xff, nil
+	}
+
+	x, err = d.readByte()
+	if err != nil {
+		return 0, err
+	}
+	if x != 0xff {
+		d.bytes.nUnreadable = 1
+		return x, nil
+	}
+
+	x, err = d.readByte()
+	if err != nil {
+		d.bytes.nUnreadable = 1
+		return 0, err
+	}
+	d.bytes.nUnreadable = 2
+	if x != 0x00 {
+		return 0, errMissingFF00
+	}
+	return 0xff, nil
+}
+
+// readFull reads exactly len(p) bytes into p. It does not care about byte
+// stuffing.
+func (d *decoder) readFull(p []byte) error {
+	// Unread the overshot bytes, if any.
+	if d.bytes.nUnreadable != 0 {
+		if d.bits.n >= 8 {
+			d.unreadByteStuffedByte()
+		}
+		d.bytes.nUnreadable = 0
+	}
+
+	for {
+		n := copy(p, d.bytes.buf[d.bytes.i:d.bytes.j])
+		p = p[n:]
+		d.bytes.i += n
+		if len(p) == 0 {
+			break
+		}
+		if err := d.fill(); err != nil {
+			if err == io.EOF {
+				err = io.ErrUnexpectedEOF
+			}
+			return err
+		}
+	}
+	return nil
+}
+
+// ignore ignores the next n bytes.
+func (d *decoder) ignore(n int) error {
+	// Unread the overshot bytes, if any.
+	if d.bytes.nUnreadable != 0 {
+		if d.bits.n >= 8 {
+			d.unreadByteStuffedByte()
+		}
+		d.bytes.nUnreadable = 0
+	}
+
+	for {
+		m := d.bytes.j - d.bytes.i
+		if m > n {
+			m = n
+		}
+		d.bytes.i += m
+		n -= m
+		if n == 0 {
+			break
+		}
+		if err := d.fill(); err != nil {
+			if err == io.EOF {
+				err = io.ErrUnexpectedEOF
+			}
+			return err
+		}
+	}
+	return nil
+}
+
+// Specified in section B.2.2.
+func (d *decoder) processSOF(n int) error {
+	switch n {
+	case 6 + 3*nGrayComponent:
+		d.nComp = nGrayComponent
+	case 6 + 3*nColorComponent:
+		d.nComp = nColorComponent
+	default:
+		return UnsupportedError("SOF has wrong length")
+	}
+	if err := d.readFull(d.tmp[:n]); err != nil {
+		return err
+	}
+	// We only support 8-bit precision.
+	if d.tmp[0] != 8 {
+		return UnsupportedError("precision")
+	}
+	d.height = int(d.tmp[1])<<8 + int(d.tmp[2])
+	d.width = int(d.tmp[3])<<8 + int(d.tmp[4])
+	if int(d.tmp[5]) != d.nComp {
+		return UnsupportedError("SOF has wrong number of image components")
+	}
+	for i := 0; i < d.nComp; i++ {
+		d.comp[i].c = d.tmp[6+3*i]
+		d.comp[i].tq = d.tmp[8+3*i]
+		if d.nComp == nGrayComponent {
+			// If a JPEG image has only one component, section A.2 says "this data
+			// is non-interleaved by definition" and section A.2.2 says "[in this
+			// case...] the order of data units within a scan shall be left-to-right
+			// and top-to-bottom... regardless of the values of H_1 and V_1". Section
+			// 4.8.2 also says "[for non-interleaved data], the MCU is defined to be
+			// one data unit". Similarly, section A.1.1 explains that it is the ratio
+			// of H_i to max_j(H_j) that matters, and similarly for V. For grayscale
+			// images, H_1 is the maximum H_j for all components j, so that ratio is
+			// always 1. The component's (h, v) is effectively always (1, 1): even if
+			// the nominal (h, v) is (2, 1), a 20x5 image is encoded in three 8x8
+			// MCUs, not two 16x8 MCUs.
+			d.comp[i].h = 1
+			d.comp[i].v = 1
+			continue
+		}
+		hv := d.tmp[7+3*i]
+		d.comp[i].h = int(hv >> 4)
+		d.comp[i].v = int(hv & 0x0f)
+		// For color images, we only support 4:4:4, 4:4:0, 4:2:2 or 4:2:0 chroma
+		// downsampling ratios. This implies that the (h, v) values for the Y
+		// component are either (1, 1), (1, 2), (2, 1) or (2, 2), and the (h, v)
+		// values for the Cr and Cb components must be (1, 1).
+		if i == 0 {
+			if hv != 0x11 && hv != 0x21 && hv != 0x22 && hv != 0x12 {
+				return UnsupportedError("luma/chroma downsample ratio")
+			}
+		} else if hv != 0x11 {
+			return UnsupportedError("luma/chroma downsample ratio")
+		}
+	}
+	return nil
+}
+
+// Specified in section B.2.4.1.
+func (d *decoder) processDQT(n int) error {
+	const qtLength = 1 + blockSize
+	for ; n >= qtLength; n -= qtLength {
+		if err := d.readFull(d.tmp[:qtLength]); err != nil {
+			return err
+		}
+		pq := d.tmp[0] >> 4
+		if pq != 0 {
+			return UnsupportedError("bad Pq value")
+		}
+		tq := d.tmp[0] & 0x0f
+		if tq > maxTq {
+			return FormatError("bad Tq value")
+		}
+		for i := range d.quant[tq] {
+			d.quant[tq][i] = int32(d.tmp[i+1])
+		}
+	}
+	if n != 0 {
+		return FormatError("DQT has wrong length")
+	}
+	return nil
+}
+
+// Specified in section B.2.4.4.
+func (d *decoder) processDRI(n int) error {
+	if n != 2 {
+		return FormatError("DRI has wrong length")
+	}
+	if err := d.readFull(d.tmp[:2]); err != nil {
+		return err
+	}
+	d.ri = int(d.tmp[0])<<8 + int(d.tmp[1])
+	return nil
+}
+
+// decode reads a JPEG image from r and returns it as an image.Image.
+func (d *decoder) decode(r io.Reader, configOnly bool) (image.Image, error) {
+	d.r = r
+
+	// Check for the Start Of Image marker.
+	if err := d.readFull(d.tmp[:2]); err != nil {
+		return nil, err
+	}
+	if d.tmp[0] != 0xff || d.tmp[1] != soiMarker {
+		return nil, FormatError("missing SOI marker")
+	}
+
+	// Process the remaining segments until the End Of Image marker.
+	for {
+		err := d.readFull(d.tmp[:2])
+		if err != nil {
+			return nil, err
+		}
+		for d.tmp[0] != 0xff {
+			// Strictly speaking, this is a format error. However, libjpeg is
+			// liberal in what it accepts. As of version 9, next_marker in
+			// jdmarker.c treats this as a warning (JWRN_EXTRANEOUS_DATA) and
+			// continues to decode the stream. Even before next_marker sees
+			// extraneous data, jpeg_fill_bit_buffer in jdhuff.c reads as many
+			// bytes as it can, possibly past the end of a scan's data. It
+			// effectively puts back any markers that it overscanned (e.g. an
+			// "\xff\xd9" EOI marker), but it does not put back non-marker data,
+			// and thus it can silently ignore a small number of extraneous
+			// non-marker bytes before next_marker has a chance to see them (and
+			// print a warning).
+			//
+			// We are therefore also liberal in what we accept. Extraneous data
+			// is silently ignored.
+			//
+			// This is similar to, but not exactly the same as, the restart
+			// mechanism within a scan (the RST[0-7] markers).
+			//
+			// Note that extraneous 0xff bytes in e.g. SOS data are escaped as
+			// "\xff\x00", and so are detected a little further down below.
+			d.tmp[0] = d.tmp[1]
+			d.tmp[1], err = d.readByte()
+			if err != nil {
+				return nil, err
+			}
+		}
+		marker := d.tmp[1]
+		if marker == 0 {
+			// Treat "\xff\x00" as extraneous data.
+			continue
+		}
+		for marker == 0xff {
+			// Section B.1.1.2 says, "Any marker may optionally be preceded by any
+			// number of fill bytes, which are bytes assigned code X'FF'".
+			marker, err = d.readByte()
+			if err != nil {
+				return nil, err
+			}
+		}
+		if marker == eoiMarker { // End Of Image.
+			break
+		}
+		if rst0Marker <= marker && marker <= rst7Marker {
+			// Figures B.2 and B.16 of the specification suggest that restart markers should
+			// only occur between Entropy Coded Segments and not after the final ECS.
+			// However, some encoders may generate incorrect JPEGs with a final restart
+			// marker. That restart marker will be seen here instead of inside the processSOS
+			// method, and is ignored as a harmless error. Restart markers have no extra data,
+			// so we check for this before we read the 16-bit length of the segment.
+			continue
+		}
+
+		// Read the 16-bit length of the segment. The value includes the 2 bytes for the
+		// length itself, so we subtract 2 to get the number of remaining bytes.
+		if err = d.readFull(d.tmp[:2]); err != nil {
+			return nil, err
+		}
+		n := int(d.tmp[0])<<8 + int(d.tmp[1]) - 2
+		if n < 0 {
+			return nil, FormatError("short segment length")
+		}
+
+		switch {
+		case marker == sof0Marker || marker == sof2Marker: // Start Of Frame.
+			d.progressive = marker == sof2Marker
+			err = d.processSOF(n)
+			if configOnly {
+				return nil, err
+			}
+		case marker == dhtMarker: // Define Huffman Table.
+			err = d.processDHT(n)
+		case marker == dqtMarker: // Define Quantization Table.
+			err = d.processDQT(n)
+		case marker == sosMarker: // Start Of Scan.
+			err = d.processSOS(n)
+		case marker == driMarker: // Define Restart Interval.
+			err = d.processDRI(n)
+		case app0Marker <= marker && marker <= app15Marker || marker == comMarker: // APPlication specific, or COMment.
+			err = d.ignore(n)
+		default:
+			err = UnsupportedError("unknown marker")
+		}
+		if err != nil {
+			return nil, err
+		}
+	}
+	if d.img1 != nil {
+		return d.img1, nil
+	}
+	if d.img3 != nil {
+		return d.img3, nil
+	}
+	return nil, FormatError("missing SOS marker")
+}
+
+// Decode reads a JPEG image from r and returns it as an image.Image.
+func Decode(r io.Reader) (image.Image, error) {
+	var d decoder
+	return d.decode(r, false)
+}
+
+// DecodeConfig returns the color model and dimensions of a JPEG image without
+// decoding the entire image.
+func DecodeConfig(r io.Reader) (image.Config, error) {
+	var d decoder
+	if _, err := d.decode(r, true); err != nil {
+		return image.Config{}, err
+	}
+	switch d.nComp {
+	case nGrayComponent:
+		return image.Config{
+			ColorModel: color.GrayModel,
+			Width:      d.width,
+			Height:     d.height,
+		}, nil
+	case nColorComponent:
+		return image.Config{
+			ColorModel: color.YCbCrModel,
+			Width:      d.width,
+			Height:     d.height,
+		}, nil
+	}
+	return image.Config{}, FormatError("missing SOF marker")
+}
+
+func init() {
+	image.RegisterFormat("jpeg", "\xff\xd8", Decode, DecodeConfig)
+}
diff --git a/src/image/jpeg/reader_test.go b/src/image/jpeg/reader_test.go
new file mode 100644
index 000000000..93f4adab9
--- /dev/null
+++ b/src/image/jpeg/reader_test.go
@@ -0,0 +1,224 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package jpeg
+
+import (
+	"bytes"
+	"fmt"
+	"image"
+	"image/color"
+	"io/ioutil"
+	"math/rand"
+	"os"
+	"strings"
+	"testing"
+)
+
+// TestDecodeProgressive tests that decoding the baseline and progressive
+// versions of the same image result in exactly the same pixel data, in YCbCr
+// space for color images, and Y space for grayscale images.
+func TestDecodeProgressive(t *testing.T) {
+	testCases := []string{
+		"../testdata/video-001",
+		"../testdata/video-001.q50.420",
+		"../testdata/video-001.q50.422",
+		"../testdata/video-001.q50.440",
+		"../testdata/video-001.q50.444",
+		"../testdata/video-005.gray.q50",
+		"../testdata/video-005.gray.q50.2x2",
+		"../testdata/video-001.separate.dc.progression",
+	}
+	for _, tc := range testCases {
+		m0, err := decodeFile(tc + ".jpeg")
+		if err != nil {
+			t.Errorf("%s: %v", tc+".jpeg", err)
+			continue
+		}
+		m1, err := decodeFile(tc + ".progressive.jpeg")
+		if err != nil {
+			t.Errorf("%s: %v", tc+".progressive.jpeg", err)
+			continue
+		}
+		if m0.Bounds() != m1.Bounds() {
+			t.Errorf("%s: bounds differ: %v and %v", tc, m0.Bounds(), m1.Bounds())
+			continue
+		}
+		// All of the video-*.jpeg files are 150x103.
+		if m0.Bounds() != image.Rect(0, 0, 150, 103) {
+			t.Errorf("%s: bad bounds: %v", tc, m0.Bounds())
+			continue
+		}
+
+		switch m0 := m0.(type) {
+		case *image.YCbCr:
+			m1 := m1.(*image.YCbCr)
+			if err := check(m0.Bounds(), m0.Y, m1.Y, m0.YStride, m1.YStride); err != nil {
+				t.Errorf("%s (Y): %v", tc, err)
+				continue
+			}
+			if err := check(m0.Bounds(), m0.Cb, m1.Cb, m0.CStride, m1.CStride); err != nil {
+				t.Errorf("%s (Cb): %v", tc, err)
+				continue
+			}
+			if err := check(m0.Bounds(), m0.Cr, m1.Cr, m0.CStride, m1.CStride); err != nil {
+				t.Errorf("%s (Cr): %v", tc, err)
+				continue
+			}
+		case *image.Gray:
+			m1 := m1.(*image.Gray)
+			if err := check(m0.Bounds(), m0.Pix, m1.Pix, m0.Stride, m1.Stride); err != nil {
+				t.Errorf("%s: %v", tc, err)
+				continue
+			}
+		default:
+			t.Errorf("%s: unexpected image type %T", tc, m0)
+			continue
+		}
+	}
+}
+
+func decodeFile(filename string) (image.Image, error) {
+	f, err := os.Open(filename)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	return Decode(f)
+}
+
+// check checks that the two pix data are equal, within the given bounds.
+func check(bounds image.Rectangle, pix0, pix1 []byte, stride0, stride1 int) error {
+	if stride0 <= 0 || stride0%8 != 0 {
+		return fmt.Errorf("bad stride %d", stride0)
+	}
+	if stride1 <= 0 || stride1%8 != 0 {
+		return fmt.Errorf("bad stride %d", stride1)
+	}
+	// Compare the two pix data, one 8x8 block at a time.
+	for y := 0; y < len(pix0)/stride0 && y < len(pix1)/stride1; y += 8 {
+		for x := 0; x < stride0 && x < stride1; x += 8 {
+			if x >= bounds.Max.X || y >= bounds.Max.Y {
+				// We don't care if the two pix data differ if the 8x8 block is
+				// entirely outside of the image's bounds. For example, this can
+				// occur with a 4:2:0 chroma subsampling and a 1x1 image. Baseline
+				// decoding works on the one 16x16 MCU as a whole; progressive
+				// decoding's first pass works on that 16x16 MCU as a whole but
+				// refinement passes only process one 8x8 block within the MCU.
+				continue
+			}
+
+			for j := 0; j < 8; j++ {
+				for i := 0; i < 8; i++ {
+					index0 := (y+j)*stride0 + (x + i)
+					index1 := (y+j)*stride1 + (x + i)
+					if pix0[index0] != pix1[index1] {
+						return fmt.Errorf("blocks at (%d, %d) differ:\n%sand\n%s", x, y,
+							pixString(pix0, stride0, x, y),
+							pixString(pix1, stride1, x, y),
+						)
+					}
+				}
+			}
+		}
+	}
+	return nil
+}
+
+func pixString(pix []byte, stride, x, y int) string {
+	s := bytes.NewBuffer(nil)
+	for j := 0; j < 8; j++ {
+		fmt.Fprintf(s, "\t")
+		for i := 0; i < 8; i++ {
+			fmt.Fprintf(s, "%02x ", pix[(y+j)*stride+(x+i)])
+		}
+		fmt.Fprintf(s, "\n")
+	}
+	return s.String()
+}
+
+func TestExtraneousData(t *testing.T) {
+	// Encode a 1x1 red image.
+	src := image.NewRGBA(image.Rect(0, 0, 1, 1))
+	src.Set(0, 0, color.RGBA{0xff, 0x00, 0x00, 0xff})
+	buf := new(bytes.Buffer)
+	if err := Encode(buf, src, nil); err != nil {
+		t.Fatalf("encode: %v", err)
+	}
+	enc := buf.String()
+	// Sanity check that the encoded JPEG is long enough, that it ends in a
+	// "\xff\xd9" EOI marker, and that it contains a "\xff\xda" SOS marker
+	// somewhere in the final 64 bytes.
+	if len(enc) < 64 {
+		t.Fatalf("encoded JPEG is too short: %d bytes", len(enc))
+	}
+	if got, want := enc[len(enc)-2:], "\xff\xd9"; got != want {
+		t.Fatalf("encoded JPEG ends with %q, want %q", got, want)
+	}
+	if s := enc[len(enc)-64:]; !strings.Contains(s, "\xff\xda") {
+		t.Fatalf("encoded JPEG does not contain a SOS marker (ff da) near the end: % x", s)
+	}
+	// Test that adding some random junk between the SOS marker and the
+	// EOI marker does not affect the decoding.
+	rnd := rand.New(rand.NewSource(1))
+	for i, nerr := 0, 0; i < 1000 && nerr < 10; i++ {
+		buf.Reset()
+		// Write all but the trailing "\xff\xd9" EOI marker.
+		buf.WriteString(enc[:len(enc)-2])
+		// Write some random extraneous data.
+		for n := rnd.Intn(10); n > 0; n-- {
+			if x := byte(rnd.Intn(256)); x != 0xff {
+				buf.WriteByte(x)
+			} else {
+				// The JPEG format escapes a SOS 0xff data byte as "\xff\x00".
+				buf.WriteString("\xff\x00")
+			}
+		}
+		// Write the "\xff\xd9" EOI marker.
+		buf.WriteString("\xff\xd9")
+
+		// Check that we can still decode the resultant image.
+		got, err := Decode(buf)
+		if err != nil {
+			t.Errorf("could not decode image #%d: %v", i, err)
+			nerr++
+			continue
+		}
+		if got.Bounds() != src.Bounds() {
+			t.Errorf("image #%d, bounds differ: %v and %v", i, got.Bounds(), src.Bounds())
+			nerr++
+			continue
+		}
+		if averageDelta(got, src) > 2<<8 {
+			t.Errorf("image #%d changed too much after a round trip", i)
+			nerr++
+			continue
+		}
+	}
+}
+
+func benchmarkDecode(b *testing.B, filename string) {
+	b.StopTimer()
+	data, err := ioutil.ReadFile(filename)
+	if err != nil {
+		b.Fatal(err)
+	}
+	cfg, err := DecodeConfig(bytes.NewReader(data))
+	if err != nil {
+		b.Fatal(err)
+	}
+	b.SetBytes(int64(cfg.Width * cfg.Height * 4))
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		Decode(bytes.NewReader(data))
+	}
+}
+
+func BenchmarkDecodeBaseline(b *testing.B) {
+	benchmarkDecode(b, "../testdata/video-001.jpeg")
+}
+
+func BenchmarkDecodeProgressive(b *testing.B) {
+	benchmarkDecode(b, "../testdata/video-001.progressive.jpeg")
+}
diff --git a/src/image/jpeg/scan.go b/src/image/jpeg/scan.go
new file mode 100644
index 000000000..8d81b0808
--- /dev/null
+++ b/src/image/jpeg/scan.go
@@ -0,0 +1,436 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package jpeg
+
+import (
+	"image"
+)
+
+// makeImg allocates and initializes the destination image.
+func (d *decoder) makeImg(h0, v0, mxx, myy int) {
+	if d.nComp == nGrayComponent {
+		m := image.NewGray(image.Rect(0, 0, 8*mxx, 8*myy))
+		d.img1 = m.SubImage(image.Rect(0, 0, d.width, d.height)).(*image.Gray)
+		return
+	}
+	var subsampleRatio image.YCbCrSubsampleRatio
+	switch {
+	case h0 == 1 && v0 == 1:
+		subsampleRatio = image.YCbCrSubsampleRatio444
+	case h0 == 1 && v0 == 2:
+		subsampleRatio = image.YCbCrSubsampleRatio440
+	case h0 == 2 && v0 == 1:
+		subsampleRatio = image.YCbCrSubsampleRatio422
+	case h0 == 2 && v0 == 2:
+		subsampleRatio = image.YCbCrSubsampleRatio420
+	default:
+		panic("unreachable")
+	}
+	m := image.NewYCbCr(image.Rect(0, 0, 8*h0*mxx, 8*v0*myy), subsampleRatio)
+	d.img3 = m.SubImage(image.Rect(0, 0, d.width, d.height)).(*image.YCbCr)
+}
+
+// Specified in section B.2.3.
+func (d *decoder) processSOS(n int) error {
+	if d.nComp == 0 {
+		return FormatError("missing SOF marker")
+	}
+	if n < 6 || 4+2*d.nComp < n || n%2 != 0 {
+		return FormatError("SOS has wrong length")
+	}
+	if err := d.readFull(d.tmp[:n]); err != nil {
+		return err
+	}
+	nComp := int(d.tmp[0])
+	if n != 4+2*nComp {
+		return FormatError("SOS length inconsistent with number of components")
+	}
+	var scan [nColorComponent]struct {
+		compIndex uint8
+		td        uint8 // DC table selector.
+		ta        uint8 // AC table selector.
+	}
+	for i := 0; i < nComp; i++ {
+		cs := d.tmp[1+2*i] // Component selector.
+		compIndex := -1
+		for j, comp := range d.comp {
+			if cs == comp.c {
+				compIndex = j
+			}
+		}
+		if compIndex < 0 {
+			return FormatError("unknown component selector")
+		}
+		scan[i].compIndex = uint8(compIndex)
+		scan[i].td = d.tmp[2+2*i] >> 4
+		scan[i].ta = d.tmp[2+2*i] & 0x0f
+	}
+
+	// zigStart and zigEnd are the spectral selection bounds.
+	// ah and al are the successive approximation high and low values.
+	// The spec calls these values Ss, Se, Ah and Al.
+	//
+	// For progressive JPEGs, these are the two more-or-less independent
+	// aspects of progression. Spectral selection progression is when not
+	// all of a block's 64 DCT coefficients are transmitted in one pass.
+	// For example, three passes could transmit coefficient 0 (the DC
+	// component), coefficients 1-5, and coefficients 6-63, in zig-zag
+	// order. Successive approximation is when not all of the bits of a
+	// band of coefficients are transmitted in one pass. For example,
+	// three passes could transmit the 6 most significant bits, followed
+	// by the second-least significant bit, followed by the least
+	// significant bit.
+	//
+	// For baseline JPEGs, these parameters are hard-coded to 0/63/0/0.
+	zigStart, zigEnd, ah, al := int32(0), int32(blockSize-1), uint32(0), uint32(0)
+	if d.progressive {
+		zigStart = int32(d.tmp[1+2*nComp])
+		zigEnd = int32(d.tmp[2+2*nComp])
+		ah = uint32(d.tmp[3+2*nComp] >> 4)
+		al = uint32(d.tmp[3+2*nComp] & 0x0f)
+		if (zigStart == 0 && zigEnd != 0) || zigStart > zigEnd || blockSize <= zigEnd {
+			return FormatError("bad spectral selection bounds")
+		}
+		if zigStart != 0 && nComp != 1 {
+			return FormatError("progressive AC coefficients for more than one component")
+		}
+		if ah != 0 && ah != al+1 {
+			return FormatError("bad successive approximation values")
+		}
+	}
+
+	// mxx and myy are the number of MCUs (Minimum Coded Units) in the image.
+	h0, v0 := d.comp[0].h, d.comp[0].v // The h and v values from the Y components.
+	mxx := (d.width + 8*h0 - 1) / (8 * h0)
+	myy := (d.height + 8*v0 - 1) / (8 * v0)
+	if d.img1 == nil && d.img3 == nil {
+		d.makeImg(h0, v0, mxx, myy)
+	}
+	if d.progressive {
+		for i := 0; i < nComp; i++ {
+			compIndex := scan[i].compIndex
+			if d.progCoeffs[compIndex] == nil {
+				d.progCoeffs[compIndex] = make([]block, mxx*myy*d.comp[compIndex].h*d.comp[compIndex].v)
+			}
+		}
+	}
+
+	d.bits = bits{}
+	mcu, expectedRST := 0, uint8(rst0Marker)
+	var (
+		// b is the decoded coefficients, in natural (not zig-zag) order.
+		b  block
+		dc [nColorComponent]int32
+		// bx and by are the location of the current (in terms of 8x8 blocks).
+		// For example, with 4:2:0 chroma subsampling, the block whose top left
+		// pixel co-ordinates are (16, 8) is the third block in the first row:
+		// bx is 2 and by is 0, even though the pixel is in the second MCU.
+		bx, by     int
+		blockCount int
+	)
+	for my := 0; my < myy; my++ {
+		for mx := 0; mx < mxx; mx++ {
+			for i := 0; i < nComp; i++ {
+				compIndex := scan[i].compIndex
+				qt := &d.quant[d.comp[compIndex].tq]
+				for j := 0; j < d.comp[compIndex].h*d.comp[compIndex].v; j++ {
+					// The blocks are traversed one MCU at a time. For 4:2:0 chroma
+					// subsampling, there are four Y 8x8 blocks in every 16x16 MCU.
+					//
+					// For a baseline 32x16 pixel image, the Y blocks visiting order is:
+					//	0 1 4 5
+					//	2 3 6 7
+					//
+					// For progressive images, the interleaved scans (those with nComp > 1)
+					// are traversed as above, but non-interleaved scans are traversed left
+					// to right, top to bottom:
+					//	0 1 2 3
+					//	4 5 6 7
+					// Only DC scans (zigStart == 0) can be interleaved. AC scans must have
+					// only one component.
+					//
+					// To further complicate matters, for non-interleaved scans, there is no
+					// data for any blocks that are inside the image at the MCU level but
+					// outside the image at the pixel level. For example, a 24x16 pixel 4:2:0
+					// progressive image consists of two 16x16 MCUs. The interleaved scans
+					// will process 8 Y blocks:
+					//	0 1 4 5
+					//	2 3 6 7
+					// The non-interleaved scans will process only 6 Y blocks:
+					//	0 1 2
+					//	3 4 5
+					if nComp != 1 {
+						bx, by = d.comp[compIndex].h*mx, d.comp[compIndex].v*my
+						if h0 == 1 {
+							by += j
+						} else {
+							bx += j % 2
+							by += j / 2
+						}
+					} else {
+						q := mxx * d.comp[compIndex].h
+						bx = blockCount % q
+						by = blockCount / q
+						blockCount++
+						if bx*8 >= d.width || by*8 >= d.height {
+							continue
+						}
+					}
+
+					// Load the previous partially decoded coefficients, if applicable.
+					if d.progressive {
+						b = d.progCoeffs[compIndex][by*mxx*d.comp[compIndex].h+bx]
+					} else {
+						b = block{}
+					}
+
+					if ah != 0 {
+						if err := d.refine(&b, &d.huff[acTable][scan[i].ta], zigStart, zigEnd, 1<<al); err != nil {
+							return err
+						}
+					} else {
+						zig := zigStart
+						if zig == 0 {
+							zig++
+							// Decode the DC coefficient, as specified in section F.2.2.1.
+							value, err := d.decodeHuffman(&d.huff[dcTable][scan[i].td])
+							if err != nil {
+								return err
+							}
+							if value > 16 {
+								return UnsupportedError("excessive DC component")
+							}
+							dcDelta, err := d.receiveExtend(value)
+							if err != nil {
+								return err
+							}
+							dc[compIndex] += dcDelta
+							b[0] = dc[compIndex] << al
+						}
+
+						if zig <= zigEnd && d.eobRun > 0 {
+							d.eobRun--
+						} else {
+							// Decode the AC coefficients, as specified in section F.2.2.2.
+							huff := &d.huff[acTable][scan[i].ta]
+							for ; zig <= zigEnd; zig++ {
+								value, err := d.decodeHuffman(huff)
+								if err != nil {
+									return err
+								}
+								val0 := value >> 4
+								val1 := value & 0x0f
+								if val1 != 0 {
+									zig += int32(val0)
+									if zig > zigEnd {
+										break
+									}
+									ac, err := d.receiveExtend(val1)
+									if err != nil {
+										return err
+									}
+									b[unzig[zig]] = ac << al
+								} else {
+									if val0 != 0x0f {
+										d.eobRun = uint16(1 << val0)
+										if val0 != 0 {
+											bits, err := d.decodeBits(int32(val0))
+											if err != nil {
+												return err
+											}
+											d.eobRun |= uint16(bits)
+										}
+										d.eobRun--
+										break
+									}
+									zig += 0x0f
+								}
+							}
+						}
+					}
+
+					if d.progressive {
+						if zigEnd != blockSize-1 || al != 0 {
+							// We haven't completely decoded this 8x8 block. Save the coefficients.
+							d.progCoeffs[compIndex][by*mxx*d.comp[compIndex].h+bx] = b
+							// At this point, we could execute the rest of the loop body to dequantize and
+							// perform the inverse DCT, to save early stages of a progressive image to the
+							// *image.YCbCr buffers (the whole point of progressive encoding), but in Go,
+							// the jpeg.Decode function does not return until the entire image is decoded,
+							// so we "continue" here to avoid wasted computation.
+							continue
+						}
+					}
+
+					// Dequantize, perform the inverse DCT and store the block to the image.
+					for zig := 0; zig < blockSize; zig++ {
+						b[unzig[zig]] *= qt[zig]
+					}
+					idct(&b)
+					dst, stride := []byte(nil), 0
+					if d.nComp == nGrayComponent {
+						dst, stride = d.img1.Pix[8*(by*d.img1.Stride+bx):], d.img1.Stride
+					} else {
+						switch compIndex {
+						case 0:
+							dst, stride = d.img3.Y[8*(by*d.img3.YStride+bx):], d.img3.YStride
+						case 1:
+							dst, stride = d.img3.Cb[8*(by*d.img3.CStride+bx):], d.img3.CStride
+						case 2:
+							dst, stride = d.img3.Cr[8*(by*d.img3.CStride+bx):], d.img3.CStride
+						default:
+							return UnsupportedError("too many components")
+						}
+					}
+					// Level shift by +128, clip to [0, 255], and write to dst.
+					for y := 0; y < 8; y++ {
+						y8 := y * 8
+						yStride := y * stride
+						for x := 0; x < 8; x++ {
+							c := b[y8+x]
+							if c < -128 {
+								c = 0
+							} else if c > 127 {
+								c = 255
+							} else {
+								c += 128
+							}
+							dst[yStride+x] = uint8(c)
+						}
+					}
+				} // for j
+			} // for i
+			mcu++
+			if d.ri > 0 && mcu%d.ri == 0 && mcu < mxx*myy {
+				// A more sophisticated decoder could use RST[0-7] markers to resynchronize from corrupt input,
+				// but this one assumes well-formed input, and hence the restart marker follows immediately.
+				if err := d.readFull(d.tmp[:2]); err != nil {
+					return err
+				}
+				if d.tmp[0] != 0xff || d.tmp[1] != expectedRST {
+					return FormatError("bad RST marker")
+				}
+				expectedRST++
+				if expectedRST == rst7Marker+1 {
+					expectedRST = rst0Marker
+				}
+				// Reset the Huffman decoder.
+				d.bits = bits{}
+				// Reset the DC components, as per section F.2.1.3.1.
+				dc = [nColorComponent]int32{}
+				// Reset the progressive decoder state, as per section G.1.2.2.
+				d.eobRun = 0
+			}
+		} // for mx
+	} // for my
+
+	return nil
+}
+
+// refine decodes a successive approximation refinement block, as specified in
+// section G.1.2.
+func (d *decoder) refine(b *block, h *huffman, zigStart, zigEnd, delta int32) error {
+	// Refining a DC component is trivial.
+	if zigStart == 0 {
+		if zigEnd != 0 {
+			panic("unreachable")
+		}
+		bit, err := d.decodeBit()
+		if err != nil {
+			return err
+		}
+		if bit {
+			b[0] |= delta
+		}
+		return nil
+	}
+
+	// Refining AC components is more complicated; see sections G.1.2.2 and G.1.2.3.
+	zig := zigStart
+	if d.eobRun == 0 {
+	loop:
+		for ; zig <= zigEnd; zig++ {
+			z := int32(0)
+			value, err := d.decodeHuffman(h)
+			if err != nil {
+				return err
+			}
+			val0 := value >> 4
+			val1 := value & 0x0f
+
+			switch val1 {
+			case 0:
+				if val0 != 0x0f {
+					d.eobRun = uint16(1 << val0)
+					if val0 != 0 {
+						bits, err := d.decodeBits(int32(val0))
+						if err != nil {
+							return err
+						}
+						d.eobRun |= uint16(bits)
+					}
+					break loop
+				}
+			case 1:
+				z = delta
+				bit, err := d.decodeBit()
+				if err != nil {
+					return err
+				}
+				if !bit {
+					z = -z
+				}
+			default:
+				return FormatError("unexpected Huffman code")
+			}
+
+			zig, err = d.refineNonZeroes(b, zig, zigEnd, int32(val0), delta)
+			if err != nil {
+				return err
+			}
+			if zig > zigEnd {
+				return FormatError("too many coefficients")
+			}
+			if z != 0 {
+				b[unzig[zig]] = z
+			}
+		}
+	}
+	if d.eobRun > 0 {
+		d.eobRun--
+		if _, err := d.refineNonZeroes(b, zig, zigEnd, -1, delta); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// refineNonZeroes refines non-zero entries of b in zig-zag order. If nz >= 0,
+// the first nz zero entries are skipped over.
+func (d *decoder) refineNonZeroes(b *block, zig, zigEnd, nz, delta int32) (int32, error) {
+	for ; zig <= zigEnd; zig++ {
+		u := unzig[zig]
+		if b[u] == 0 {
+			if nz == 0 {
+				break
+			}
+			nz--
+			continue
+		}
+		bit, err := d.decodeBit()
+		if err != nil {
+			return 0, err
+		}
+		if !bit {
+			continue
+		}
+		if b[u] >= 0 {
+			b[u] += delta
+		} else {
+			b[u] -= delta
+		}
+	}
+	return zig, nil
+}
diff --git a/src/image/jpeg/writer.go b/src/image/jpeg/writer.go
new file mode 100644
index 000000000..91bbde3bf
--- /dev/null
+++ b/src/image/jpeg/writer.go
@@ -0,0 +1,614 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package jpeg
+
+import (
+	"bufio"
+	"errors"
+	"image"
+	"image/color"
+	"io"
+)
+
+// min returns the minimum of two integers.
+func min(x, y int) int {
+	if x < y {
+		return x
+	}
+	return y
+}
+
+// div returns a/b rounded to the nearest integer, instead of rounded to zero.
+func div(a, b int32) int32 {
+	if a >= 0 {
+		return (a + (b >> 1)) / b
+	}
+	return -((-a + (b >> 1)) / b)
+}
+
+// bitCount counts the number of bits needed to hold an integer.
+var bitCount = [256]byte{
+	0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+	5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+	8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+	8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+	8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+	8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+	8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+	8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+	8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+	8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+}
+
+type quantIndex int
+
+const (
+	quantIndexLuminance quantIndex = iota
+	quantIndexChrominance
+	nQuantIndex
+)
+
+// unscaledQuant are the unscaled quantization tables in zig-zag order. Each
+// encoder copies and scales the tables according to its quality parameter.
+// The values are derived from section K.1 after converting from natural to
+// zig-zag order.
+var unscaledQuant = [nQuantIndex][blockSize]byte{
+	// Luminance.
+	{
+		16, 11, 12, 14, 12, 10, 16, 14,
+		13, 14, 18, 17, 16, 19, 24, 40,
+		26, 24, 22, 22, 24, 49, 35, 37,
+		29, 40, 58, 51, 61, 60, 57, 51,
+		56, 55, 64, 72, 92, 78, 64, 68,
+		87, 69, 55, 56, 80, 109, 81, 87,
+		95, 98, 103, 104, 103, 62, 77, 113,
+		121, 112, 100, 120, 92, 101, 103, 99,
+	},
+	// Chrominance.
+	{
+		17, 18, 18, 24, 21, 24, 47, 26,
+		26, 47, 99, 66, 56, 66, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+	},
+}
+
+type huffIndex int
+
+const (
+	huffIndexLuminanceDC huffIndex = iota
+	huffIndexLuminanceAC
+	huffIndexChrominanceDC
+	huffIndexChrominanceAC
+	nHuffIndex
+)
+
+// huffmanSpec specifies a Huffman encoding.
+type huffmanSpec struct {
+	// count[i] is the number of codes of length i bits.
+	count [16]byte
+	// value[i] is the decoded value of the i'th codeword.
+	value []byte
+}
+
+// theHuffmanSpec is the Huffman encoding specifications.
+// This encoder uses the same Huffman encoding for all images.
+var theHuffmanSpec = [nHuffIndex]huffmanSpec{
+	// Luminance DC.
+	{
+		[16]byte{0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0},
+		[]byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+	},
+	// Luminance AC.
+	{
+		[16]byte{0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 125},
+		[]byte{
+			0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
+			0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
+			0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
+			0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
+			0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
+			0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
+			0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+			0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+			0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+			0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+			0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+			0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+			0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+			0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+			0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
+			0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
+			0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
+			0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
+			0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
+			0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+			0xf9, 0xfa,
+		},
+	},
+	// Chrominance DC.
+	{
+		[16]byte{0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0},
+		[]byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+	},
+	// Chrominance AC.
+	{
+		[16]byte{0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 119},
+		[]byte{
+			0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
+			0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
+			0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
+			0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
+			0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
+			0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
+			0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
+			0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
+			0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
+			0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+			0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
+			0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+			0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
+			0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
+			0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
+			0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
+			0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
+			0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
+			0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
+			0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+			0xf9, 0xfa,
+		},
+	},
+}
+
+// huffmanLUT is a compiled look-up table representation of a huffmanSpec.
+// Each value maps to a uint32 of which the 8 most significant bits hold the
+// codeword size in bits and the 24 least significant bits hold the codeword.
+// The maximum codeword size is 16 bits.
+type huffmanLUT []uint32
+
+func (h *huffmanLUT) init(s huffmanSpec) {
+	maxValue := 0
+	for _, v := range s.value {
+		if int(v) > maxValue {
+			maxValue = int(v)
+		}
+	}
+	*h = make([]uint32, maxValue+1)
+	code, k := uint32(0), 0
+	for i := 0; i < len(s.count); i++ {
+		nBits := uint32(i+1) << 24
+		for j := uint8(0); j < s.count[i]; j++ {
+			(*h)[s.value[k]] = nBits | code
+			code++
+			k++
+		}
+		code <<= 1
+	}
+}
+
+// theHuffmanLUT are compiled representations of theHuffmanSpec.
+var theHuffmanLUT [4]huffmanLUT
+
+func init() {
+	for i, s := range theHuffmanSpec {
+		theHuffmanLUT[i].init(s)
+	}
+}
+
+// writer is a buffered writer.
+type writer interface {
+	Flush() error
+	io.Writer
+	io.ByteWriter
+}
+
+// encoder encodes an image to the JPEG format.
+type encoder struct {
+	// w is the writer to write to. err is the first error encountered during
+	// writing. All attempted writes after the first error become no-ops.
+	w   writer
+	err error
+	// buf is a scratch buffer.
+	buf [16]byte
+	// bits and nBits are accumulated bits to write to w.
+	bits, nBits uint32
+	// quant is the scaled quantization tables, in zig-zag order.
+	quant [nQuantIndex][blockSize]byte
+}
+
+func (e *encoder) flush() {
+	if e.err != nil {
+		return
+	}
+	e.err = e.w.Flush()
+}
+
+func (e *encoder) write(p []byte) {
+	if e.err != nil {
+		return
+	}
+	_, e.err = e.w.Write(p)
+}
+
+func (e *encoder) writeByte(b byte) {
+	if e.err != nil {
+		return
+	}
+	e.err = e.w.WriteByte(b)
+}
+
+// emit emits the least significant nBits bits of bits to the bit-stream.
+// The precondition is bits < 1<<nBits && nBits <= 16.
+func (e *encoder) emit(bits, nBits uint32) {
+	nBits += e.nBits
+	bits <<= 32 - nBits
+	bits |= e.bits
+	for nBits >= 8 {
+		b := uint8(bits >> 24)
+		e.writeByte(b)
+		if b == 0xff {
+			e.writeByte(0x00)
+		}
+		bits <<= 8
+		nBits -= 8
+	}
+	e.bits, e.nBits = bits, nBits
+}
+
+// emitHuff emits the given value with the given Huffman encoder.
+func (e *encoder) emitHuff(h huffIndex, value int32) {
+	x := theHuffmanLUT[h][value]
+	e.emit(x&(1<<24-1), x>>24)
+}
+
+// emitHuffRLE emits a run of runLength copies of value encoded with the given
+// Huffman encoder.
+func (e *encoder) emitHuffRLE(h huffIndex, runLength, value int32) {
+	a, b := value, value
+	if a < 0 {
+		a, b = -value, value-1
+	}
+	var nBits uint32
+	if a < 0x100 {
+		nBits = uint32(bitCount[a])
+	} else {
+		nBits = 8 + uint32(bitCount[a>>8])
+	}
+	e.emitHuff(h, runLength<<4|int32(nBits))
+	if nBits > 0 {
+		e.emit(uint32(b)&(1<<nBits-1), nBits)
+	}
+}
+
+// writeMarkerHeader writes the header for a marker with the given length.
+func (e *encoder) writeMarkerHeader(marker uint8, markerlen int) {
+	e.buf[0] = 0xff
+	e.buf[1] = marker
+	e.buf[2] = uint8(markerlen >> 8)
+	e.buf[3] = uint8(markerlen & 0xff)
+	e.write(e.buf[:4])
+}
+
+// writeDQT writes the Define Quantization Table marker.
+func (e *encoder) writeDQT() {
+	const markerlen = 2 + int(nQuantIndex)*(1+blockSize)
+	e.writeMarkerHeader(dqtMarker, markerlen)
+	for i := range e.quant {
+		e.writeByte(uint8(i))
+		e.write(e.quant[i][:])
+	}
+}
+
+// writeSOF0 writes the Start Of Frame (Baseline) marker.
+func (e *encoder) writeSOF0(size image.Point, nComponent int) {
+	markerlen := 8 + 3*nComponent
+	e.writeMarkerHeader(sof0Marker, markerlen)
+	e.buf[0] = 8 // 8-bit color.
+	e.buf[1] = uint8(size.Y >> 8)
+	e.buf[2] = uint8(size.Y & 0xff)
+	e.buf[3] = uint8(size.X >> 8)
+	e.buf[4] = uint8(size.X & 0xff)
+	e.buf[5] = uint8(nComponent)
+	if nComponent == 1 {
+		e.buf[6] = 1
+		// No subsampling for grayscale image.
+		e.buf[7] = 0x11
+		e.buf[8] = 0x00
+	} else {
+		for i := 0; i < nComponent; i++ {
+			e.buf[3*i+6] = uint8(i + 1)
+			// We use 4:2:0 chroma subsampling.
+			e.buf[3*i+7] = "\x22\x11\x11"[i]
+			e.buf[3*i+8] = "\x00\x01\x01"[i]
+		}
+	}
+	e.write(e.buf[:3*(nComponent-1)+9])
+}
+
+// writeDHT writes the Define Huffman Table marker.
+func (e *encoder) writeDHT(nComponent int) {
+	markerlen := 2
+	specs := theHuffmanSpec[:]
+	if nComponent == 1 {
+		// Drop the Chrominance tables.
+		specs = specs[:2]
+	}
+	for _, s := range specs {
+		markerlen += 1 + 16 + len(s.value)
+	}
+	e.writeMarkerHeader(dhtMarker, markerlen)
+	for i, s := range specs {
+		e.writeByte("\x00\x10\x01\x11"[i])
+		e.write(s.count[:])
+		e.write(s.value)
+	}
+}
+
+// writeBlock writes a block of pixel data using the given quantization table,
+// returning the post-quantized DC value of the DCT-transformed block. b is in
+// natural (not zig-zag) order.
+func (e *encoder) writeBlock(b *block, q quantIndex, prevDC int32) int32 {
+	fdct(b)
+	// Emit the DC delta.
+	dc := div(b[0], 8*int32(e.quant[q][0]))
+	e.emitHuffRLE(huffIndex(2*q+0), 0, dc-prevDC)
+	// Emit the AC components.
+	h, runLength := huffIndex(2*q+1), int32(0)
+	for zig := 1; zig < blockSize; zig++ {
+		ac := div(b[unzig[zig]], 8*int32(e.quant[q][zig]))
+		if ac == 0 {
+			runLength++
+		} else {
+			for runLength > 15 {
+				e.emitHuff(h, 0xf0)
+				runLength -= 16
+			}
+			e.emitHuffRLE(h, runLength, ac)
+			runLength = 0
+		}
+	}
+	if runLength > 0 {
+		e.emitHuff(h, 0x00)
+	}
+	return dc
+}
+
+// toYCbCr converts the 8x8 region of m whose top-left corner is p to its
+// YCbCr values.
+func toYCbCr(m image.Image, p image.Point, yBlock, cbBlock, crBlock *block) {
+	b := m.Bounds()
+	xmax := b.Max.X - 1
+	ymax := b.Max.Y - 1
+	for j := 0; j < 8; j++ {
+		for i := 0; i < 8; i++ {
+			r, g, b, _ := m.At(min(p.X+i, xmax), min(p.Y+j, ymax)).RGBA()
+			yy, cb, cr := color.RGBToYCbCr(uint8(r>>8), uint8(g>>8), uint8(b>>8))
+			yBlock[8*j+i] = int32(yy)
+			cbBlock[8*j+i] = int32(cb)
+			crBlock[8*j+i] = int32(cr)
+		}
+	}
+}
+
+// grayToY stores the 8x8 region of m whose top-left corner is p in yBlock.
+func grayToY(m *image.Gray, p image.Point, yBlock *block) {
+	b := m.Bounds()
+	xmax := b.Max.X - 1
+	ymax := b.Max.Y - 1
+	pix := m.Pix
+	for j := 0; j < 8; j++ {
+		for i := 0; i < 8; i++ {
+			idx := m.PixOffset(min(p.X+i, xmax), min(p.Y+j, ymax))
+			yBlock[8*j+i] = int32(pix[idx])
+		}
+	}
+}
+
+// rgbaToYCbCr is a specialized version of toYCbCr for image.RGBA images.
+func rgbaToYCbCr(m *image.RGBA, p image.Point, yBlock, cbBlock, crBlock *block) {
+	b := m.Bounds()
+	xmax := b.Max.X - 1
+	ymax := b.Max.Y - 1
+	for j := 0; j < 8; j++ {
+		sj := p.Y + j
+		if sj > ymax {
+			sj = ymax
+		}
+		offset := (sj-b.Min.Y)*m.Stride - b.Min.X*4
+		for i := 0; i < 8; i++ {
+			sx := p.X + i
+			if sx > xmax {
+				sx = xmax
+			}
+			pix := m.Pix[offset+sx*4:]
+			yy, cb, cr := color.RGBToYCbCr(pix[0], pix[1], pix[2])
+			yBlock[8*j+i] = int32(yy)
+			cbBlock[8*j+i] = int32(cb)
+			crBlock[8*j+i] = int32(cr)
+		}
+	}
+}
+
+// scale scales the 16x16 region represented by the 4 src blocks to the 8x8
+// dst block.
+func scale(dst *block, src *[4]block) {
+	for i := 0; i < 4; i++ {
+		dstOff := (i&2)<<4 | (i&1)<<2
+		for y := 0; y < 4; y++ {
+			for x := 0; x < 4; x++ {
+				j := 16*y + 2*x
+				sum := src[i][j] + src[i][j+1] + src[i][j+8] + src[i][j+9]
+				dst[8*y+x+dstOff] = (sum + 2) >> 2
+			}
+		}
+	}
+}
+
+// sosHeaderY is the SOS marker "\xff\xda" followed by 8 bytes:
+//	- the marker length "\x00\x08",
+//	- the number of components "\x01",
+//	- component 1 uses DC table 0 and AC table 0 "\x01\x00",
+//	- the bytes "\x00\x3f\x00". Section B.2.3 of the spec says that for
+//	  sequential DCTs, those bytes (8-bit Ss, 8-bit Se, 4-bit Ah, 4-bit Al)
+//	  should be 0x00, 0x3f, 0x00<<4 | 0x00.
+var sosHeaderY = []byte{
+	0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x00, 0x3f, 0x00,
+}
+
+// sosHeaderYCbCr is the SOS marker "\xff\xda" followed by 12 bytes:
+//	- the marker length "\x00\x0c",
+//	- the number of components "\x03",
+//	- component 1 uses DC table 0 and AC table 0 "\x01\x00",
+//	- component 2 uses DC table 1 and AC table 1 "\x02\x11",
+//	- component 3 uses DC table 1 and AC table 1 "\x03\x11",
+//	- the bytes "\x00\x3f\x00". Section B.2.3 of the spec says that for
+//	  sequential DCTs, those bytes (8-bit Ss, 8-bit Se, 4-bit Ah, 4-bit Al)
+//	  should be 0x00, 0x3f, 0x00<<4 | 0x00.
+var sosHeaderYCbCr = []byte{
+	0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02,
+	0x11, 0x03, 0x11, 0x00, 0x3f, 0x00,
+}
+
+// writeSOS writes the StartOfScan marker.
+func (e *encoder) writeSOS(m image.Image) {
+	switch m.(type) {
+	case *image.Gray:
+		e.write(sosHeaderY)
+	default:
+		e.write(sosHeaderYCbCr)
+	}
+	var (
+		// Scratch buffers to hold the YCbCr values.
+		// The blocks are in natural (not zig-zag) order.
+		b      block
+		cb, cr [4]block
+		// DC components are delta-encoded.
+		prevDCY, prevDCCb, prevDCCr int32
+	)
+	bounds := m.Bounds()
+	switch m := m.(type) {
+	// TODO(wathiede): switch on m.ColorModel() instead of type.
+	case *image.Gray:
+		for y := bounds.Min.Y; y < bounds.Max.Y; y += 8 {
+			for x := bounds.Min.X; x < bounds.Max.X; x += 8 {
+				p := image.Pt(x, y)
+				grayToY(m, p, &b)
+				prevDCY = e.writeBlock(&b, 0, prevDCY)
+			}
+		}
+	default:
+		rgba, _ := m.(*image.RGBA)
+		for y := bounds.Min.Y; y < bounds.Max.Y; y += 16 {
+			for x := bounds.Min.X; x < bounds.Max.X; x += 16 {
+				for i := 0; i < 4; i++ {
+					xOff := (i & 1) * 8
+					yOff := (i & 2) * 4
+					p := image.Pt(x+xOff, y+yOff)
+					if rgba != nil {
+						rgbaToYCbCr(rgba, p, &b, &cb[i], &cr[i])
+					} else {
+						toYCbCr(m, p, &b, &cb[i], &cr[i])
+					}
+					prevDCY = e.writeBlock(&b, 0, prevDCY)
+				}
+				scale(&b, &cb)
+				prevDCCb = e.writeBlock(&b, 1, prevDCCb)
+				scale(&b, &cr)
+				prevDCCr = e.writeBlock(&b, 1, prevDCCr)
+			}
+		}
+	}
+	// Pad the last byte with 1's.
+	e.emit(0x7f, 7)
+}
+
+// DefaultQuality is the default quality encoding parameter.
+const DefaultQuality = 75
+
+// Options are the encoding parameters.
+// Quality ranges from 1 to 100 inclusive, higher is better.
+type Options struct {
+	Quality int
+}
+
+// Encode writes the Image m to w in JPEG 4:2:0 baseline format with the given
+// options. Default parameters are used if a nil *Options is passed.
+func Encode(w io.Writer, m image.Image, o *Options) error {
+	b := m.Bounds()
+	if b.Dx() >= 1<<16 || b.Dy() >= 1<<16 {
+		return errors.New("jpeg: image is too large to encode")
+	}
+	var e encoder
+	if ww, ok := w.(writer); ok {
+		e.w = ww
+	} else {
+		e.w = bufio.NewWriter(w)
+	}
+	// Clip quality to [1, 100].
+	quality := DefaultQuality
+	if o != nil {
+		quality = o.Quality
+		if quality < 1 {
+			quality = 1
+		} else if quality > 100 {
+			quality = 100
+		}
+	}
+	// Convert from a quality rating to a scaling factor.
+	var scale int
+	if quality < 50 {
+		scale = 5000 / quality
+	} else {
+		scale = 200 - quality*2
+	}
+	// Initialize the quantization tables.
+	for i := range e.quant {
+		for j := range e.quant[i] {
+			x := int(unscaledQuant[i][j])
+			x = (x*scale + 50) / 100
+			if x < 1 {
+				x = 1
+			} else if x > 255 {
+				x = 255
+			}
+			e.quant[i][j] = uint8(x)
+		}
+	}
+	// Compute number of components based on input image type.
+	nComponent := 3
+	switch m.(type) {
+	// TODO(wathiede): switch on m.ColorModel() instead of type.
+	case *image.Gray:
+		nComponent = 1
+	}
+	// Write the Start Of Image marker.
+	e.buf[0] = 0xff
+	e.buf[1] = 0xd8
+	e.write(e.buf[:2])
+	// Write the quantization tables.
+	e.writeDQT()
+	// Write the image dimensions.
+	e.writeSOF0(b.Size(), nComponent)
+	// Write the Huffman tables.
+	e.writeDHT(nComponent)
+	// Write the image data.
+	e.writeSOS(m)
+	// Write the End Of Image marker.
+	e.buf[0] = 0xff
+	e.buf[1] = 0xd9
+	e.write(e.buf[:2])
+	e.flush()
+	return e.err
+}
diff --git a/src/image/jpeg/writer_test.go b/src/image/jpeg/writer_test.go
new file mode 100644
index 000000000..3df3cfcc5
--- /dev/null
+++ b/src/image/jpeg/writer_test.go
@@ -0,0 +1,232 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package jpeg
+
+import (
+	"bytes"
+	"fmt"
+	"image"
+	"image/color"
+	"image/png"
+	"io/ioutil"
+	"math/rand"
+	"os"
+	"testing"
+)
+
+// zigzag maps from the natural ordering to the zig-zag ordering. For example,
+// zigzag[0*8 + 3] is the zig-zag sequence number of the element in the fourth
+// column and first row.
+var zigzag = [blockSize]int{
+	0, 1, 5, 6, 14, 15, 27, 28,
+	2, 4, 7, 13, 16, 26, 29, 42,
+	3, 8, 12, 17, 25, 30, 41, 43,
+	9, 11, 18, 24, 31, 40, 44, 53,
+	10, 19, 23, 32, 39, 45, 52, 54,
+	20, 22, 33, 38, 46, 51, 55, 60,
+	21, 34, 37, 47, 50, 56, 59, 61,
+	35, 36, 48, 49, 57, 58, 62, 63,
+}
+
+func TestZigUnzig(t *testing.T) {
+	for i := 0; i < blockSize; i++ {
+		if unzig[zigzag[i]] != i {
+			t.Errorf("unzig[zigzag[%d]] == %d", i, unzig[zigzag[i]])
+		}
+		if zigzag[unzig[i]] != i {
+			t.Errorf("zigzag[unzig[%d]] == %d", i, zigzag[unzig[i]])
+		}
+	}
+}
+
+// unscaledQuantInNaturalOrder are the unscaled quantization tables in
+// natural (not zig-zag) order, as specified in section K.1.
+var unscaledQuantInNaturalOrder = [nQuantIndex][blockSize]byte{
+	// Luminance.
+	{
+		16, 11, 10, 16, 24, 40, 51, 61,
+		12, 12, 14, 19, 26, 58, 60, 55,
+		14, 13, 16, 24, 40, 57, 69, 56,
+		14, 17, 22, 29, 51, 87, 80, 62,
+		18, 22, 37, 56, 68, 109, 103, 77,
+		24, 35, 55, 64, 81, 104, 113, 92,
+		49, 64, 78, 87, 103, 121, 120, 101,
+		72, 92, 95, 98, 112, 100, 103, 99,
+	},
+	// Chrominance.
+	{
+		17, 18, 24, 47, 99, 99, 99, 99,
+		18, 21, 26, 66, 99, 99, 99, 99,
+		24, 26, 56, 99, 99, 99, 99, 99,
+		47, 66, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+		99, 99, 99, 99, 99, 99, 99, 99,
+	},
+}
+
+func TestUnscaledQuant(t *testing.T) {
+	bad := false
+	for i := quantIndex(0); i < nQuantIndex; i++ {
+		for zig := 0; zig < blockSize; zig++ {
+			got := unscaledQuant[i][zig]
+			want := unscaledQuantInNaturalOrder[i][unzig[zig]]
+			if got != want {
+				t.Errorf("i=%d, zig=%d: got %d, want %d", i, zig, got, want)
+				bad = true
+			}
+		}
+	}
+	if bad {
+		names := [nQuantIndex]string{"Luminance", "Chrominance"}
+		buf := &bytes.Buffer{}
+		for i, name := range names {
+			fmt.Fprintf(buf, "// %s.\n{\n", name)
+			for zig := 0; zig < blockSize; zig++ {
+				fmt.Fprintf(buf, "%d, ", unscaledQuantInNaturalOrder[i][unzig[zig]])
+				if zig%8 == 7 {
+					buf.WriteString("\n")
+				}
+			}
+			buf.WriteString("},\n")
+		}
+		t.Logf("expected unscaledQuant values:\n%s", buf.String())
+	}
+}
+
+var testCase = []struct {
+	filename  string
+	quality   int
+	tolerance int64
+}{
+	{"../testdata/video-001.png", 1, 24 << 8},
+	{"../testdata/video-001.png", 20, 12 << 8},
+	{"../testdata/video-001.png", 60, 8 << 8},
+	{"../testdata/video-001.png", 80, 6 << 8},
+	{"../testdata/video-001.png", 90, 4 << 8},
+	{"../testdata/video-001.png", 100, 2 << 8},
+}
+
+func delta(u0, u1 uint32) int64 {
+	d := int64(u0) - int64(u1)
+	if d < 0 {
+		return -d
+	}
+	return d
+}
+
+func readPng(filename string) (image.Image, error) {
+	f, err := os.Open(filename)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	return png.Decode(f)
+}
+
+func TestWriter(t *testing.T) {
+	for _, tc := range testCase {
+		// Read the image.
+		m0, err := readPng(tc.filename)
+		if err != nil {
+			t.Error(tc.filename, err)
+			continue
+		}
+		// Encode that image as JPEG.
+		var buf bytes.Buffer
+		err = Encode(&buf, m0, &Options{Quality: tc.quality})
+		if err != nil {
+			t.Error(tc.filename, err)
+			continue
+		}
+		// Decode that JPEG.
+		m1, err := Decode(&buf)
+		if err != nil {
+			t.Error(tc.filename, err)
+			continue
+		}
+		if m0.Bounds() != m1.Bounds() {
+			t.Errorf("%s, bounds differ: %v and %v", tc.filename, m0.Bounds(), m1.Bounds())
+			continue
+		}
+		// Compare the average delta to the tolerance level.
+		if averageDelta(m0, m1) > tc.tolerance {
+			t.Errorf("%s, quality=%d: average delta is too high", tc.filename, tc.quality)
+			continue
+		}
+	}
+}
+
+// TestWriteGrayscale tests that a grayscale images survives a round-trip
+// through encode/decode cycle.
+func TestWriteGrayscale(t *testing.T) {
+	m0 := image.NewGray(image.Rect(0, 0, 32, 32))
+	for i := range m0.Pix {
+		m0.Pix[i] = uint8(i)
+	}
+	var buf bytes.Buffer
+	if err := Encode(&buf, m0, nil); err != nil {
+		t.Fatal(err)
+	}
+	m1, err := Decode(&buf)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if m0.Bounds() != m1.Bounds() {
+		t.Fatalf("bounds differ: %v and %v", m0.Bounds(), m1.Bounds())
+	}
+	if _, ok := m1.(*image.Gray); !ok {
+		t.Errorf("got %T, want *image.Gray", m1)
+	}
+	// Compare the average delta to the tolerance level.
+	want := int64(2 << 8)
+	if got := averageDelta(m0, m1); got > want {
+		t.Errorf("average delta too high; got %d, want <= %d", got, want)
+	}
+}
+
+// averageDelta returns the average delta in RGB space. The two images must
+// have the same bounds.
+func averageDelta(m0, m1 image.Image) int64 {
+	b := m0.Bounds()
+	var sum, n int64
+	for y := b.Min.Y; y < b.Max.Y; y++ {
+		for x := b.Min.X; x < b.Max.X; x++ {
+			c0 := m0.At(x, y)
+			c1 := m1.At(x, y)
+			r0, g0, b0, _ := c0.RGBA()
+			r1, g1, b1, _ := c1.RGBA()
+			sum += delta(r0, r1)
+			sum += delta(g0, g1)
+			sum += delta(b0, b1)
+			n += 3
+		}
+	}
+	return sum / n
+}
+
+func BenchmarkEncode(b *testing.B) {
+	b.StopTimer()
+	img := image.NewRGBA(image.Rect(0, 0, 640, 480))
+	bo := img.Bounds()
+	rnd := rand.New(rand.NewSource(123))
+	for y := bo.Min.Y; y < bo.Max.Y; y++ {
+		for x := bo.Min.X; x < bo.Max.X; x++ {
+			img.SetRGBA(x, y, color.RGBA{
+				uint8(rnd.Intn(256)),
+				uint8(rnd.Intn(256)),
+				uint8(rnd.Intn(256)),
+				255,
+			})
+		}
+	}
+	b.SetBytes(640 * 480 * 4)
+	b.StartTimer()
+	options := &Options{Quality: 90}
+	for i := 0; i < b.N; i++ {
+		Encode(ioutil.Discard, img, options)
+	}
+}