Qmm pad fix #2503

Closed · wants to merge 6 commits
7 changes: 5 additions & 2 deletions candle-core/src/quantized/cuda.rs
@@ -34,7 +34,10 @@ fn ceil_div(p: usize, q: usize) -> usize {
}

fn pad(p: usize, q: usize) -> usize {
-ceil_div(p, q) * q
+// Overallocate by a full q rather than just rounding up to a multiple of q: the
+// last row also needs padding and we don't have enough information here to know
+// how many elements to add :(
+// ceil_div(p, q) * q
+p + q
}

fn quantize_q8_1(
@@ -439,7 +442,7 @@ impl QCudaStorage {
}
_ => crate::bail!("only f32 can be quantized"),
};
-let src_len = src.len();
+let src_len = pad(src.len(), MATRIX_ROW_PADDING);
let src = crate::Storage::Cpu(crate::CpuStorage::F32(src));
let mut qcpu_storage = crate::Device::Cpu.qzeros(src_len, self.dtype)?;
qcpu_storage.quantize(&src)?;
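Why `p + q` instead of rounding up: when `p` is already a multiple of `q`, `ceil_div(p, q) * q` adds no extra room, yet the last row still needs padding. A standalone sketch of the two behaviors; the constant's value here is an assumption for illustration, not necessarily candle's:

```rust
// Illustrative value only; candle's CUDA backend defines its own MATRIX_ROW_PADDING.
const MATRIX_ROW_PADDING: usize = 512;

fn ceil_div(p: usize, q: usize) -> usize {
    (p + q - 1) / q
}

// Old behavior: round p up to the next multiple of q.
fn pad_round_up(p: usize, q: usize) -> usize {
    ceil_div(p, q) * q
}

// New behavior: always reserve a full extra q elements, so the last row can be
// padded even when p is already a multiple of q.
fn pad_overallocate(p: usize, q: usize) -> usize {
    p + q
}

fn main() {
    // Already aligned: round-up padding adds no spare room...
    assert_eq!(pad_round_up(1024, MATRIX_ROW_PADDING), 1024);
    // ...while the fix always leaves q extra elements.
    assert_eq!(pad_overallocate(1024, MATRIX_ROW_PADDING), 1536);
    // Unaligned: both add room, but only the new version guarantees at least q spare.
    assert_eq!(pad_round_up(1000, MATRIX_ROW_PADDING), 1024);
    assert_eq!(pad_overallocate(1000, MATRIX_ROW_PADDING), 1512);
}
```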
2 changes: 1 addition & 1 deletion candle-core/src/quantized/utils.rs
@@ -18,7 +18,7 @@ pub(super) fn group_for_quantization<'a, 'b, T: super::k_quants::GgmlType>(
let actual_blocks = ys.len();

// Validate that the input is the right size
-if expected_blocks != actual_blocks {
+if actual_blocks < expected_blocks {
crate::bail!("quantize {dtype:?}: expected {expected_blocks} blocks but only {actual_blocks} were provided!")
}

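This relaxed check mirrors the overallocation above: a destination buffer larger than strictly required is now accepted, while an undersized one still fails. A minimal sketch of the logic; the function name and error type are illustrative, not candle's actual API:

```rust
// Sketch of the relaxed validation: overallocated buffers pass, undersized fail.
fn validate_blocks(expected_blocks: usize, actual_blocks: usize) -> Result<(), String> {
    if actual_blocks < expected_blocks {
        return Err(format!(
            "expected {expected_blocks} blocks but only {actual_blocks} were provided!"
        ));
    }
    Ok(())
}

fn main() {
    assert!(validate_blocks(4, 4).is_ok()); // exact size still accepted
    assert!(validate_blocks(4, 5).is_ok()); // overallocated buffers now pass
    assert!(validate_blocks(4, 3).is_err()); // undersized input still rejected
}
```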
2 changes: 1 addition & 1 deletion candle-examples/examples/flux/README.md
@@ -13,7 +13,7 @@
descriptions,

```bash
cargo run --features cuda --example flux -r -- \
---height 1024 --width 1024
+--height 1024 --width 1024 \
--prompt "a rusty robot walking on a beach holding a small torch, the robot has the word \"rust\" written on it, high quality, 4k"
```

76 changes: 57 additions & 19 deletions candle-examples/examples/flux/main.rs
@@ -23,6 +23,10 @@ struct Args {
#[arg(long)]
cpu: bool,

+/// Use the quantized model.
+#[arg(long)]
+quantized: bool,
+
/// Enable tracing (generates a trace-timestamp.json file).
#[arg(long)]
tracing: bool,
@@ -60,6 +64,7 @@ fn run(args: Args) -> Result<()> {
tracing,
decode_only,
model,
+quantized,
} = args;
let width = width.unwrap_or(1360);
let height = height.unwrap_or(768);
@@ -146,38 +151,71 @@ fn run(args: Args) -> Result<()> {
};
println!("CLIP\n{clip_emb}");
let img = {
-let model_file = match model {
-    Model::Schnell => bf_repo.get("flux1-schnell.safetensors")?,
-    Model::Dev => bf_repo.get("flux1-dev.safetensors")?,
-};
-let vb =
-    unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], dtype, &device)? };
let cfg = match model {
Model::Dev => flux::model::Config::dev(),
Model::Schnell => flux::model::Config::schnell(),
};
let img = flux::sampling::get_noise(1, height, width, &device)?.to_dtype(dtype)?;
-let state = flux::sampling::State::new(&t5_emb, &clip_emb, &img)?;
+let state = if quantized {
+    flux::sampling::State::new(
+        &t5_emb.to_dtype(candle::DType::F32)?,
+        &clip_emb.to_dtype(candle::DType::F32)?,
+        &img.to_dtype(candle::DType::F32)?,
+    )?
+} else {
+    flux::sampling::State::new(&t5_emb, &clip_emb, &img)?
+};
let timesteps = match model {
Model::Dev => {
flux::sampling::get_schedule(50, Some((state.img.dim(1)?, 0.5, 1.15)))
}
Model::Schnell => flux::sampling::get_schedule(4, None),
};
-let model = flux::model::Flux::new(&cfg, vb)?;
-
println!("{state:?}");
println!("{timesteps:?}");
-flux::sampling::denoise(
-    &model,
-    &state.img,
-    &state.img_ids,
-    &state.txt,
-    &state.txt_ids,
-    &state.vec,
-    &timesteps,
-    4.,
-)?
+if quantized {
+    let model_file = match model {
+        Model::Schnell => api
+            .repo(hf_hub::Repo::model("lmz/candle-flux".to_string()))
+            .get("flux1-schnell.gguf")?,
+        Model::Dev => todo!(),
+    };
+    let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(
+        model_file, &device,
+    )?;
+
+    let model = flux::quantized_model::Flux::new(&cfg, vb)?;
+    flux::sampling::denoise(
+        &model,
+        &state.img,
+        &state.img_ids,
+        &state.txt,
+        &state.txt_ids,
+        &state.vec,
+        &timesteps,
+        4.,
+    )?
+    .to_dtype(dtype)?
+} else {
+    let model_file = match model {
+        Model::Schnell => bf_repo.get("flux1-schnell.safetensors")?,
+        Model::Dev => bf_repo.get("flux1-dev.safetensors")?,
+    };
+    let vb = unsafe {
+        VarBuilder::from_mmaped_safetensors(&[model_file], dtype, &device)?
+    };
+    let model = flux::model::Flux::new(&cfg, vb)?;
+    flux::sampling::denoise(
+        &model,
+        &state.img,
+        &state.img_ids,
+        &state.txt,
+        &state.txt_ids,
+        &state.vec,
+        &timesteps,
+        4.,
+    )?
+}
};
flux::sampling::unpack(&img, height, width)?
}
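With the new flag, the quantized GGUF path can be selected from the command line. An example invocation in the same style as the README; the `--quantized` spelling follows the `#[arg(long)] quantized` field above, the other options are illustrative:

```bash
cargo run --features cuda --example flux -r -- \
  --quantized \
  --height 1024 --width 1024 \
  --prompt "a rusty robot walking on a beach holding a small torch"
```

Note that only the Schnell model is wired up on the quantized path; `Model::Dev => todo!()` still panics.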
17 changes: 17 additions & 0 deletions candle-transformers/src/models/flux/mod.rs
@@ -1,3 +1,20 @@
+use candle::{Result, Tensor};
+
+pub trait WithForward {
+    #[allow(clippy::too_many_arguments)]
+    fn forward(
+        &self,
+        img: &Tensor,
+        img_ids: &Tensor,
+        txt: &Tensor,
+        txt_ids: &Tensor,
+        timesteps: &Tensor,
+        y: &Tensor,
+        guidance: Option<&Tensor>,
+    ) -> Result<Tensor>;
+}
+
pub mod autoencoder;
pub mod model;
+pub mod quantized_model;
pub mod sampling;
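The new trait gives the bf16 `flux::model::Flux` and the GGUF-backed `flux::quantized_model::Flux` a common forward interface, so callers can be generic over either. A minimal sketch of such a consumer, assuming the `WithForward` trait above is in scope; the helper itself is hypothetical and not part of the PR:

```rust
use candle::{Result, Tensor};

// Hypothetical helper: one forward step that works for any model implementing
// WithForward, whether bf16 or quantized.
fn forward_step<M: WithForward>(
    model: &M,
    img: &Tensor,
    img_ids: &Tensor,
    txt: &Tensor,
    txt_ids: &Tensor,
    timesteps: &Tensor,
    y: &Tensor,
) -> Result<Tensor> {
    // `guidance` is None here; a guidance-distilled model would pass Some(&t).
    model.forward(img, img_ids, txt, txt_ids, timesteps, y, None)
}
```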
10 changes: 6 additions & 4 deletions candle-transformers/src/models/flux/model.rs
@@ -109,14 +109,14 @@ fn apply_rope(x: &Tensor, freq_cis: &Tensor) -> Result<Tensor> {
(fr0.broadcast_mul(&x0)? + fr1.broadcast_mul(&x1)?)?.reshape(dims.to_vec())
}

-fn attention(q: &Tensor, k: &Tensor, v: &Tensor, pe: &Tensor) -> Result<Tensor> {
+pub(crate) fn attention(q: &Tensor, k: &Tensor, v: &Tensor, pe: &Tensor) -> Result<Tensor> {
let q = apply_rope(q, pe)?.contiguous()?;
let k = apply_rope(k, pe)?.contiguous()?;
let x = scaled_dot_product_attention(&q, &k, v)?;
x.transpose(1, 2)?.flatten_from(2)
}

-fn timestep_embedding(t: &Tensor, dim: usize, dtype: DType) -> Result<Tensor> {
+pub(crate) fn timestep_embedding(t: &Tensor, dim: usize, dtype: DType) -> Result<Tensor> {
const TIME_FACTOR: f64 = 1000.;
const MAX_PERIOD: f64 = 10000.;
if dim % 2 == 1 {
@@ -144,7 +144,7 @@ pub struct EmbedNd {
}

impl EmbedNd {
-fn new(dim: usize, theta: usize, axes_dim: Vec<usize>) -> Self {
+pub fn new(dim: usize, theta: usize, axes_dim: Vec<usize>) -> Self {
Self {
dim,
theta,
@@ -575,9 +575,11 @@ impl Flux {
final_layer,
})
}
+}

+impl super::WithForward for Flux {
#[allow(clippy::too_many_arguments)]
-pub fn forward(
+fn forward(
&self,
img: &Tensor,
img_ids: &Tensor,
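These helpers go from private to `pub(crate)` so the new quantized model can reuse them instead of duplicating them. For intuition, a scalar sketch of the sinusoidal embedding that `timestep_embedding` computes, using the `TIME_FACTOR`/`MAX_PERIOD` constants visible above; the cos-then-sin concatenation order is an assumption here, not confirmed by the diff:

```rust
// Scalar sketch of a sinusoidal timestep embedding: for half = dim / 2,
// freq_i = MAX_PERIOD^(-i / half), arg_i = t * TIME_FACTOR * freq_i,
// output = [cos(arg_0..arg_half) ++ sin(arg_0..arg_half)].
fn timestep_embedding_scalar(t: f64, dim: usize) -> Vec<f64> {
    const TIME_FACTOR: f64 = 1000.;
    const MAX_PERIOD: f64 = 10000.;
    assert!(dim % 2 == 0, "{dim} is odd"); // mirrors the bail! in the tensor version
    let half = dim / 2;
    let args: Vec<f64> = (0..half)
        .map(|i| t * TIME_FACTOR * (-(i as f64) * MAX_PERIOD.ln() / half as f64).exp())
        .collect();
    let mut out = Vec::with_capacity(dim);
    out.extend(args.iter().map(|a| a.cos()));
    out.extend(args.iter().map(|a| a.sin()));
    out
}
```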