Cannot download CIFAR-10 #2632

VillSnow · 2024-11-19T13:42:46Z

/// Reproduction for this issue: loads CIFAR-10 through `candle_datasets` and
/// checks whether the returned training tensors contain any non-zero value.
///
/// The trailing comments on the `dbg!` lines record the output the reporter
/// observed when running this program.
fn main() -> Result<(), Box<dyn Error>> {
    // `?` returns early from `main` if the loader reports an error.
    let cifar = candle_datasets::vision::cifar::load()?;

    dbg!(&cifar.train_images); // Tensor[dims 50000, 3, 32, 32; u8]
    dbg!(&cifar.train_labels); // Tensor[dims 50000; u8]

    // Flatten the training images, read them out as `u8` values, and test
    // whether any value is non-zero. Observed `false`: the image data is all
    // zeros, which is the bug being reported.
    dbg!(cifar
        .train_images
        .flatten_all()?
        .to_vec1::<u8>()?
        .into_iter()
        .any(|x| x != 0)); // false

    // Same check on the training labels. Observed `true`: the labels do
    // contain non-zero values.
    dbg!(cifar
        .train_labels
        .flatten_all()?
        .to_vec1::<u8>()?
        .into_iter()
        .any(|x| x != 0)); // true

    Ok(())
}

I was able to see download progress bars for plain_text/test/0000.parquet and plain_text/train/0000.parquet, but none for the images.

The text was updated successfully, but these errors were encountered:

BerserkerMother · 2024-11-20T21:15:22Z

what is the problem here?

cschin · 2024-12-09T16:58:20Z

There are two issues in the 0.8.0 codebase for downloading CIFAR-10:

The data in the Parquet file, when read through image::load_from_memory, has the shape (W, H, C). However, the code converts the image data to (C, W, H).
The data in the Parquet file is of type u8 with values ranging from 0 to 255. It needs to be cast to f32 before being divided by 255.0_f32.

Here is a modification that works with the latest 0.8.0 code.
(I am not sure if Candle has something similar to PyTorch's movedim, so I have done the shape change with a Vec.)

diff --git a/candle-datasets/src/vision/cifar.rs b/candle-datasets/src/vision/cifar.rs
index 4b403a2e..71b2e735 100644
--- a/candle-datasets/src/vision/cifar.rs
+++ b/candle-datasets/src/vision/cifar.rs
@@ -67,13 +67,20 @@ fn load_parquet(parquet: SerializedFileReader<std::fs::File>) -> Result<(Tensor,
     let samples = parquet.metadata().file_metadata().num_rows() as usize;
     let mut buffer_images: Vec<u8> = Vec::with_capacity(samples * 1_024);
     let mut buffer_labels: Vec<u8> = Vec::with_capacity(samples);
+    let mut rgb_buffer: Vec<u8> = vec![0; 3 * W * H];
     for row in parquet.into_iter().flatten() {
         for (_name, field) in row.get_column_iter() {
             if let parquet::record::Field::Group(subrow) = field {
                 for (_name, field) in subrow.get_column_iter() {
                     if let parquet::record::Field::Bytes(value) = field {
-                        let image = image::load_from_memory(value.data()).unwrap();
-                        buffer_images.extend(image.to_rgb8().as_raw());
+                        let image = image::load_from_memory(value.data()).unwrap().to_rgb8();
+                        let raw_image = image.as_raw();
+                        raw_image.chunks(3).into_iter().enumerate().for_each( |(idx,v)| {
+                            rgb_buffer[idx] = v[0];
+                            rgb_buffer[W*H + idx] = v[1];
+                            rgb_buffer[2*W*H + idx] = v[2];
+                        } );
+                        buffer_images.extend(rgb_buffer.clone());
                     }
                 }
             } else if let parquet::record::Field::Long(label) = field {
@@ -82,7 +89,7 @@ fn load_parquet(parquet: SerializedFileReader<std::fs::File>) -> Result<(Tensor,
         }
     }
     let images = (Tensor::from_vec(buffer_images, (samples, 3, 32, 32), &Device::Cpu)?
-        .to_dtype(DType::U8)?
+        .to_dtype(DType::F32)?
         / 255.)?;
     let labels = Tensor::from_vec(buffer_labels, (samples,), &Device::Cpu)?;
     Ok((images, labels))

EricLBuehler · 2024-12-09T17:12:13Z

@cschin Candle has Tensor::permute which should be able to do this, I think:

I am not sure if Candle has something similar to PyTorch's movedim, so I have done the shape change with a Vec.

For converting a tensor of shape (W, H, C) to (C, W, H), you could do: x.permute(2,0,1).

cschin · 2024-12-09T17:23:47Z

@EricLBuehler thanks. this makes the patch cleaner

--- a/candle-datasets/src/vision/cifar.rs
+++ b/candle-datasets/src/vision/cifar.rs
@@ -81,8 +81,9 @@ fn load_parquet(parquet: SerializedFileReader<std::fs::File>) -> Result<(Tensor,
             }
         }
     }
-    let images = (Tensor::from_vec(buffer_images, (samples, 3, 32, 32), &Device::Cpu)?
-        .to_dtype(DType::U8)?
+    let images = (Tensor::from_vec(buffer_images, (samples, 32, 32, 3), &Device::Cpu)?
+        .permute( (0, 3, 1, 2) )?
+        .to_dtype(DType::F32)?
         / 255.)?;
     let labels = Tensor::from_vec(buffer_labels, (samples,), &Device::Cpu)?;
     Ok((images, labels))

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Cannot download CIFAR-10 #2632

Cannot download CIFAR-10 #2632

VillSnow commented Nov 19, 2024

BerserkerMother commented Nov 20, 2024

cschin commented Dec 9, 2024

EricLBuehler commented Dec 9, 2024

cschin commented Dec 9, 2024

Cannot download CIFAR-10 #2632

Cannot download CIFAR-10 #2632

Comments

VillSnow commented Nov 19, 2024

BerserkerMother commented Nov 20, 2024

cschin commented Dec 9, 2024

EricLBuehler commented Dec 9, 2024

cschin commented Dec 9, 2024