diff --git a/src/decoder.rs b/src/decoder.rs
index c0fef09a..69c9f0c1 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -100,8 +100,8 @@ impl<R: Read> Decoder<R> {
                 };
 
                 Some(ImageInfo {
-                    width: frame.image_size.width,
-                    height: frame.image_size.height,
+                    width: frame.output_size.width,
+                    height: frame.output_size.height,
                     pixel_format: pixel_format,
                 })
             },
@@ -116,6 +116,23 @@ impl<R: Read> Decoder<R> {
         self.decode_internal(true).map(|_| ())
     }
 
+    /// Configure the decoder to scale the image during decoding.
+    ///
+    /// This efficiently scales the image by the smallest supported scale
+    /// factor that produces an image larger than or equal to the requested
+    /// size in at least one axis (supported factors: 1/8, 1/4, 1/2 and 1).
+    /// Returns the output `(width, height)`; header read errors are propagated.
+    ///
+    /// To generate a thumbnail of an exact size, pass the desired size and
+    /// then scale to the final size using a traditional resampling algorithm.
+    pub fn scale(&mut self, requested_width: u16, requested_height: u16) -> Result<(u16, u16)> {
+        self.read_info()?;
+        let frame = self.frame.as_mut().unwrap();
+        let idct_size = crate::idct::choose_idct_size(frame.image_size, Dimensions{ width: requested_width, height: requested_height });
+        frame.update_idct_size(idct_size);
+        Ok((frame.output_size.width, frame.output_size.height))
+    }
+
     /// Decodes the image and returns the decoded pixels if successful.
     pub fn decode(&mut self) -> Result<Vec<u8>> {
         self.decode_internal(false)
@@ -329,7 +346,7 @@ impl<R: Read> Decoder<R> {
         }
 
         let frame = self.frame.as_ref().unwrap();
-        compute_image(&frame.components, &planes, frame.image_size, self.is_jfif, self.color_transform)
+        compute_image(&frame.components, &planes, frame.output_size, self.is_jfif, self.color_transform)
     }
 
     fn read_marker(&mut self) -> Result<Marker> {
@@ -435,7 +452,7 @@ impl<R: Read> Decoder<R> {
                             let x = (block_num % blocks_per_row) as u16;
                             let y = (block_num / blocks_per_row) as u16;
 
-                            if x * 8 >= component.size.width || y * 8 >= component.size.height {
+                            if x * component.dct_scale as u16 >= component.size.width || y * component.dct_scale as u16 >= component.size.height {
                                 continue;
                             }
 
@@ -764,12 +781,15 @@ fn compute_image(components: &[Component],
             return Ok(data[0].clone())
         }
 
-        let mut buffer = vec![0u8; component.size.width as usize * component.size.height as usize];
-        let line_stride = component.block_size.width as usize * 8;
+        let width = component.size.width as usize;
+        let height = component.size.height as usize;
+        let mut buffer = vec![0u8; width * height];
+        // The decoded plane's row stride is block_size.width * dct_scale, which may exceed the visible width.
+        let line_stride = component.block_size.width as usize * component.dct_scale;
 
-        for y in 0 .. component.size.height as usize {
-            for x in 0 .. component.size.width as usize {
-                buffer[y * component.size.width as usize + x] = data[0][y * line_stride + x];
+        for y in 0 .. height {
+            for x in 0 .. width {
+                buffer[y * width + x] = data[0][y * line_stride + x];
             }
         }
 
diff --git a/src/idct.rs b/src/idct.rs
index 374ee7a6..7d12fca0 100644
--- a/src/idct.rs
+++ b/src/idct.rs
@@ -1,9 +1,51 @@
 // Malicious JPEG files can cause operations in the idct to overflow.
 // One example is tests/crashtest/images/imagetestsuite/b0b8914cc5f7a6eff409f16d8cc236c5.jpg
 // That's why wrapping operators are needed.
+use crate::parser::Dimensions;
+
+pub(crate) fn choose_idct_size(full_size: Dimensions, requested_size: Dimensions) -> usize {
+    // Returns the smallest reduced IDCT size (1, 2 or 4) whose output reaches the request on at least one axis, else 8.
+    // `scaled` is ceil(len * scale / 8); the `+ 7` form cannot underflow when `len` is 0.
+    fn scaled(len: u16, scale: usize) -> u16 { ((len as u32 * scale as u32 + 7) / 8) as u16 }
+    for &scale in &[1, 2, 4] {
+        if scaled(full_size.width, scale) >= requested_size.width || scaled(full_size.height, scale) >= requested_size.height {
+            return scale;
+        }
+    }
+    8
+}
+
+#[test]
+fn test_choose_idct_size() {
+    assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 200, height: 200}), 1);
+    assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 500, height: 500}), 1);
+    assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 684, height: 456}), 1);
+    assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 999, height: 456}), 1);
+    assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 684, height: 999}), 1);
+    assert_eq!(choose_idct_size(Dimensions{width: 500, height: 333}, Dimensions{width: 63, height: 42}), 1);
+    // Requests larger than the 1/8 output on both axes need a bigger IDCT.
+    assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 685, height: 999}), 2);
+    assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 1000, height: 1000}), 2);
+    assert_eq!(choose_idct_size(Dimensions{width: 500, height: 333}, Dimensions{width: 125, height: 84}), 2);
+    assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 1400, height: 1400}), 4);
+    // Requests at or beyond the full size always fall back to the 8x8 IDCT.
+    assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 5472, height: 3648}), 8);
+    assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 16384, height: 16384}), 8);
+    assert_eq!(choose_idct_size(Dimensions{width: 1, height: 1}, Dimensions{width: 65535, height: 65535}), 8);
+}
+
+pub(crate) fn dequantize_and_idct_block(scale: usize, coefficients: &[i16], quantization_table: &[u16; 64], output_linestride: usize, output: &mut [u8]) {
+    match scale {
+        8 => dequantize_and_idct_block_8x8(coefficients, quantization_table, output_linestride, output),
+        4 => dequantize_and_idct_block_4x4(coefficients, quantization_table, output_linestride, output),
+        2 => dequantize_and_idct_block_2x2(coefficients, quantization_table, output_linestride, output),
+        1 => dequantize_and_idct_block_1x1(coefficients, quantization_table, output_linestride, output),
+        _ => panic!("Unsupported IDCT scale {}/8", scale),
+    }
+}
 
 // This is based on stb_image's 'stbi__idct_block'.
-pub fn dequantize_and_idct_block(coefficients: &[i16], quantization_table: &[u16; 64], output_linestride: usize, output: &mut [u8]) {
+fn dequantize_and_idct_block_8x8(coefficients: &[i16], quantization_table: &[u16; 64], output_linestride: usize, output: &mut [u8]) {
     debug_assert_eq!(coefficients.len(), 64);
 
     let mut temp = [0i32; 64];
@@ -155,6 +197,103 @@ pub fn dequantize_and_idct_block(coefficients: &[i16], quantization_table: &[u16
     }
 }
 
+// 4x4 and 2x2 IDCT based on Rakesh Dugad and Narendra Ahuja: "A Fast Scheme for Image Size Change in the Compressed Domain" (2001).
+// http://sylvana.net/jpegcrop/jidctred/
+fn dequantize_and_idct_block_4x4(coefficients: &[i16], quantization_table: &[u16; 64], output_linestride: usize, output: &mut [u8]) {
+    debug_assert_eq!(coefficients.len(), 64);
+    let mut temp = [0i32; 4*4];
+
+    const CONST_BITS: u32 = 12;
+    const PASS1_BITS: u32 = 2;
+    const FINAL_BITS: u32 = CONST_BITS + PASS1_BITS + 3;
+
+    // columns
+    for i in 0 .. 4 {
+        let s0 = coefficients[i + 8*0] as i32 * quantization_table[i + 8*0] as i32;
+        let s1 = coefficients[i + 8*1] as i32 * quantization_table[i + 8*1] as i32;
+        let s2 = coefficients[i + 8*2] as i32 * quantization_table[i + 8*2] as i32;
+        let s3 = coefficients[i + 8*3] as i32 * quantization_table[i + 8*3] as i32;
+    
+        let x0 = s0.wrapping_add(s2).wrapping_shl(PASS1_BITS);
+        let x2 = s0.wrapping_sub(s2).wrapping_shl(PASS1_BITS);
+
+        let p1 = s1.wrapping_add(s3).wrapping_mul(stbi_f2f(0.541196100));
+        let t0 = p1.wrapping_add(s3.wrapping_mul(stbi_f2f(-1.847759065))).wrapping_add(512).wrapping_shr(CONST_BITS - PASS1_BITS);
+        let t2 = p1.wrapping_add(s1.wrapping_mul(stbi_f2f( 0.765366865))).wrapping_add(512).wrapping_shr(CONST_BITS - PASS1_BITS);
+
+        temp[i + 4*0] = x0.wrapping_add(t2);
+        temp[i + 4*3] = x0.wrapping_sub(t2);
+        temp[i + 4*1] = x2.wrapping_add(t0);
+        temp[i + 4*2] = x2.wrapping_sub(t0);
+    }
+
+    for i in 0 .. 4 {
+        let s0 = temp[i * 4 + 0];
+        let s1 = temp[i * 4 + 1];
+        let s2 = temp[i * 4 + 2];
+        let s3 = temp[i * 4 + 3];
+
+        let x0 = s0.wrapping_add(s2).wrapping_shl(CONST_BITS);
+        let x2 = s0.wrapping_sub(s2).wrapping_shl(CONST_BITS);
+
+        let p1 = s1.wrapping_add(s3).wrapping_mul(stbi_f2f(0.541196100));
+        let t0 = p1.wrapping_add(s3.wrapping_mul(stbi_f2f(-1.847759065)));
+        let t2 = p1.wrapping_add(s1.wrapping_mul(stbi_f2f(0.765366865)));
+
+        // constants scaled things up by 1<<12, plus we had 1<<2 from first
+        // loop, plus horizontal and vertical each scale by sqrt(8) so together
+        // we've got an extra 1<<3, so 1<<17 total we need to remove.
+        // so we want to round that, which means adding 0.5 * 1<<17,
+        // aka 65536. Also, we'll end up with -128 to 127 that we want
+        // to encode as 0..255 by adding 128, so we'll add that before the shift
+        let x0 = x0.wrapping_add((1 << (FINAL_BITS - 1)) + (128 << FINAL_BITS));
+        let x2 = x2.wrapping_add((1 << (FINAL_BITS - 1)) + (128 << FINAL_BITS));
+
+        output[i * output_linestride + 0] = stbi_clamp(x0.wrapping_add(t2).wrapping_shr(FINAL_BITS));
+        output[i * output_linestride + 3] = stbi_clamp(x0.wrapping_sub(t2).wrapping_shr(FINAL_BITS));
+        output[i * output_linestride + 1] = stbi_clamp(x2.wrapping_add(t0).wrapping_shr(FINAL_BITS));
+        output[i * output_linestride + 2] = stbi_clamp(x2.wrapping_sub(t0).wrapping_shr(FINAL_BITS));
+    }
+}
+
+fn dequantize_and_idct_block_2x2(coefficients: &[i16], quantization_table: &[u16; 64], output_linestride: usize, output: &mut [u8]) {
+    debug_assert_eq!(coefficients.len(), 64);
+    // Uses only the four coefficients at indices 0, 1, 8 and 9 to produce a 2x2 block of output samples.
+    const SCALE_BITS: u32 = 3;
+
+    // Column 0
+    let s00 = coefficients[8*0] as i32 * quantization_table[8*0] as i32;
+    let s10 = coefficients[8*1] as i32 * quantization_table[8*1] as i32;
+
+    let x0 = s00.wrapping_add(s10);
+    let x2 = s00.wrapping_sub(s10);
+
+    // Column 1
+    let s01 = coefficients[8*0+1] as i32 * quantization_table[8*0+1] as i32;
+    let s11 = coefficients[8*1+1] as i32 * quantization_table[8*1+1] as i32;
+
+    let x1 = s01.wrapping_add(s11);
+    let x3 = s01.wrapping_sub(s11);
+    // Add the rounding term (half of 1 << SCALE_BITS) and the +128 offset, both pre-scaled for the final shift.
+    let x0 = x0.wrapping_add((1 << (SCALE_BITS-1)) + (128 << SCALE_BITS));
+    let x2 = x2.wrapping_add((1 << (SCALE_BITS-1)) + (128 << SCALE_BITS));
+
+    // Row 0
+    output[0] = stbi_clamp(x0.wrapping_add(x1).wrapping_shr(SCALE_BITS));
+    output[1] = stbi_clamp(x0.wrapping_sub(x1).wrapping_shr(SCALE_BITS));
+
+    // Row 1
+    output[output_linestride + 0] = stbi_clamp(x2.wrapping_add(x3).wrapping_shr(SCALE_BITS));
+    output[output_linestride + 1] = stbi_clamp(x2.wrapping_sub(x3).wrapping_shr(SCALE_BITS));
+}
+
+fn dequantize_and_idct_block_1x1(coefficients: &[i16], quantization_table: &[u16; 64], _output_linestride: usize, output: &mut [u8]) {
+    debug_assert_eq!(coefficients.len(), 64);
+    // Only the dequantized coefficient 0 is used: add 128 (pre-multiplied by 8), divide by 8, then clamp to a u8.
+    let s0 = (coefficients[0] as i32 * quantization_table[0] as i32).wrapping_add(128 * 8) / 8;
+    output[0] = stbi_clamp(s0);
+}
+
 // take a -128..127 value and stbi__clamp it and convert to 0..255
 fn stbi_clamp(x: i32) -> u8
 {
diff --git a/src/parser.rs b/src/parser.rs
index 039a75c8..4c28b9e9 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -34,6 +34,7 @@ pub struct FrameInfo {
     pub precision: u8,
 
     pub image_size: Dimensions,
+    pub output_size: Dimensions,
     pub mcu_size: Dimensions,
     pub components: Vec<Component>,
 }
@@ -58,6 +59,8 @@ pub struct Component {
 
     pub quantization_table_index: usize,
 
+    pub dct_scale: usize,
+
     pub size: Dimensions,
     pub block_size: Dimensions,
 }
@@ -79,6 +82,21 @@ pub enum AdobeColorTransform {
     YCCK,
 }
 
+impl FrameInfo {
+    pub(crate) fn update_idct_size(&mut self, idct_size: usize) {
+        for component in &mut self.components {
+            component.dct_scale = idct_size;
+        }
+        // Component sizes are derived from dct_scale, so recompute them for the new value.
+        update_component_sizes(self.image_size, &mut self.components);
+        // The output is the full image scaled by idct_size / 8, rounded up.
+        self.output_size = Dimensions {
+            width: (self.image_size.width as f32 * idct_size as f32 / 8.0).ceil() as u16,
+            height: (self.image_size.height as f32 * idct_size as f32 / 8.0).ceil() as u16
+        };
+    }
+}
+
 fn read_length<R: Read>(reader: &mut R, marker: Marker) -> Result<usize> {
     assert!(marker.has_length());
 
@@ -201,36 +219,45 @@ pub fn parse_sof<R: Read>(reader: &mut R, marker: Marker) -> Result<FrameInfo> {
             horizontal_sampling_factor: horizontal_sampling_factor,
             vertical_sampling_factor: vertical_sampling_factor,
             quantization_table_index: quantization_table_index as usize,
+            dct_scale: 8,
             size: Dimensions {width: 0, height: 0},
             block_size: Dimensions {width: 0, height: 0},
         });
     }
 
+    let mcu_size = update_component_sizes(Dimensions { width, height }, &mut components);
+
+    Ok(FrameInfo {
+        is_baseline: is_baseline,
+        is_differential: is_differential,
+        coding_process: coding_process,
+        entropy_coding: entropy_coding,
+        precision: precision,
+        image_size: Dimensions { width, height },
+        output_size: Dimensions { width, height },
+        mcu_size,
+        components: components,
+    })
+}
+
+fn update_component_sizes(size: Dimensions, components: &mut [Component]) -> Dimensions {
     let h_max = components.iter().map(|c| c.horizontal_sampling_factor).max().unwrap();
     let v_max = components.iter().map(|c| c.vertical_sampling_factor).max().unwrap();
+
     let mcu_size = Dimensions {
-        width: (width as f32 / (h_max as f32 * 8.0)).ceil() as u16,
-        height: (height as f32 / (v_max as f32 * 8.0)).ceil() as u16,
+        width: (size.width as f32 / (h_max as f32 * 8.0)).ceil() as u16,
+        height: (size.height as f32 / (v_max as f32 * 8.0)).ceil() as u16,
     };
 
-    for component in &mut components {
-        component.size.width = (width as f32 * (component.horizontal_sampling_factor as f32 / h_max as f32)).ceil() as u16;
-        component.size.height = (height as f32 * (component.vertical_sampling_factor as f32 / v_max as f32)).ceil() as u16;
+    for component in components {
+        component.size.width = (size.width as f32 * component.horizontal_sampling_factor as f32 * component.dct_scale as f32 / (h_max as f32 * 8.0)).ceil() as u16;
+        component.size.height = (size.height as f32 * component.vertical_sampling_factor as f32 * component.dct_scale as f32 / (v_max as f32 * 8.0)).ceil() as u16;
 
         component.block_size.width = mcu_size.width * component.horizontal_sampling_factor as u16;
         component.block_size.height = mcu_size.height * component.vertical_sampling_factor as u16;
     }
 
-    Ok(FrameInfo {
-        is_baseline: is_baseline,
-        is_differential: is_differential,
-        coding_process: coding_process,
-        entropy_coding: entropy_coding,
-        precision: precision,
-        image_size: Dimensions {width: width, height: height},
-        mcu_size: mcu_size,
-        components: components,
-    })
+    mcu_size
 }
 
 // Section B.2.3
diff --git a/src/upsampler.rs b/src/upsampler.rs
index 61b15c4f..31224a5c 100644
--- a/src/upsampler.rs
+++ b/src/upsampler.rs
@@ -29,7 +29,7 @@ impl Upsampler {
                 upsampler: upsampler,
                 width: component.size.width as usize,
                 height: component.size.height as usize,
-                row_stride: component.block_size.width as usize * 8,
+                row_stride: component.block_size.width as usize * component.dct_scale,
             });
         }
 
diff --git a/src/worker/immediate.rs b/src/worker/immediate.rs
index fd140485..7e512d2e 100644
--- a/src/worker/immediate.rs
+++ b/src/worker/immediate.rs
@@ -26,7 +26,7 @@ impl ImmediateWorker {
         assert!(self.results[data.index].is_empty());
 
         self.offsets[data.index] = 0;
-        self.results[data.index].resize(data.component.block_size.width as usize * data.component.block_size.height as usize * 64, 0u8);
+        self.results[data.index].resize(data.component.block_size.width as usize * data.component.block_size.height as usize * data.component.dct_scale * data.component.dct_scale, 0u8);
         self.components[data.index] = Some(data.component);
         self.quantization_tables[data.index] = Some(data.quantization_table);
     }
@@ -36,20 +36,21 @@ impl ImmediateWorker {
         let component = self.components[index].as_ref().unwrap();
         let quantization_table = self.quantization_tables[index].as_ref().unwrap();
         let block_count = component.block_size.width as usize * component.vertical_sampling_factor as usize;
-        let line_stride = component.block_size.width as usize * 8;
+        let line_stride = component.block_size.width as usize * component.dct_scale;
 
         assert_eq!(data.len(), block_count * 64);
 
         for i in 0..block_count {
-            let x = (i % component.block_size.width as usize) * 8;
-            let y = (i / component.block_size.width as usize) * 8;
-            dequantize_and_idct_block(&data[i * 64..(i + 1) * 64],
-                                    quantization_table,
-                                    line_stride,
-                                    &mut self.results[index][self.offsets[index] + y * line_stride + x..]);
+            let x = (i % component.block_size.width as usize) * component.dct_scale;
+            let y = (i / component.block_size.width as usize) * component.dct_scale;
+
+            let coefficients = &data[i * 64..(i + 1) * 64];
+            let output = &mut self.results[index][self.offsets[index] + y * line_stride + x..];
+
+            dequantize_and_idct_block(component.dct_scale, coefficients, quantization_table, line_stride, output);
         }
 
-        self.offsets[index] += data.len();
+        self.offsets[index] += block_count * component.dct_scale * component.dct_scale;
     }
     pub fn get_result_immediate(&mut self, index: usize) -> Vec<u8> {
         mem::replace(&mut self.results[index], Vec::new())
diff --git a/tests/reftest/images/rgb_125x84.png b/tests/reftest/images/rgb_125x84.png
new file mode 100644
index 00000000..c3bd2b7f
Binary files /dev/null and b/tests/reftest/images/rgb_125x84.png differ
diff --git a/tests/reftest/images/rgb_250x167.png b/tests/reftest/images/rgb_250x167.png
new file mode 100644
index 00000000..f008dada
Binary files /dev/null and b/tests/reftest/images/rgb_250x167.png differ
diff --git a/tests/reftest/images/rgb_63x42.png b/tests/reftest/images/rgb_63x42.png
new file mode 100644
index 00000000..a50b9238
Binary files /dev/null and b/tests/reftest/images/rgb_63x42.png differ
diff --git a/tests/reftest/mod.rs b/tests/reftest/mod.rs
index a35f5ff1..54f13709 100644
--- a/tests/reftest/mod.rs
+++ b/tests/reftest/mod.rs
@@ -15,9 +15,30 @@ fn reftest() {
     }
 }
 
+#[test]
+fn reftest_scaled() {
+    let base = &Path::new("tests").join("reftest").join("images");
+    reftest_scaled_file(&base.join("rgb.jpg"), 500, 333, &base.join("rgb.png"));
+    reftest_scaled_file(&base.join("rgb.jpg"), 250, 167, &base.join("rgb_250x167.png"));
+    reftest_scaled_file(&base.join("rgb.jpg"), 125, 84,  &base.join("rgb_125x84.png"));
+    reftest_scaled_file(&base.join("rgb.jpg"), 63,  42,  &base.join("rgb_63x42.png"));
+}
+
 fn reftest_file(path: &Path) {
+    let file = File::open(path).unwrap();
+    let decoder = jpeg::Decoder::new(file);
+    reftest_decoder(decoder, path, &path.with_extension("png"));
+}
+
+fn reftest_scaled_file(path: &Path, width: u16, height: u16, ref_path: &Path) {
     let file = File::open(path).unwrap();
     let mut decoder = jpeg::Decoder::new(file);
+    decoder.read_info().unwrap();
+    decoder.scale(width, height).unwrap();
+    reftest_decoder(decoder, path, &ref_path);
+}
+
+fn reftest_decoder<T: std::io::Read>(mut decoder: jpeg::Decoder<T>, path: &Path, ref_path: &Path) {
     let mut data = decoder.decode().expect(&format!("failed to decode file: {:?}", path));
     let info = decoder.info().unwrap();
     let mut pixel_format = info.pixel_format;
@@ -27,7 +48,7 @@ fn reftest_file(path: &Path) {
         pixel_format = jpeg::PixelFormat::RGB24;
     }
 
-    let ref_file = File::open(path.with_extension("png")).unwrap();
+    let ref_file = File::open(ref_path).unwrap();
     let (ref_info, mut ref_reader) = png::Decoder::new(ref_file).read_info().expect("png failed to read info");
 
     assert_eq!(ref_info.width, info.width as u32);