Candle tensor operations are a bit slower than PyTorch tensor operations

The code below was used to benchmark the candle operations:
```rust
// Micro-benchmark of several candle tensor ops on the CPU device.
// Each op is executed 100 times and the total wall-clock time is printed.
// NOTE(review): timings are only meaningful for a `--release` build — confirm
// how the reported numbers were produced.
let device = candle_core::Device::Cpu;

// Two f32 inputs of shape (32, 512, 768), created with `Tensor::rand(0, 1, ..)`.
let a = Tensor::rand(0f32, 1.0, (32, 512, 768), &device)?;
let b = Tensor::rand(0f32, 1.0, (32, 512, 768), &device)?;
// Single-element tensor used as the right-hand operand of `broadcast_add`.
let c = Tensor::new(&[1f32], &device)?;
// `b` converted to I64; used as the index tensor for `gather`.
let d = b.to_dtype(candle_core::DType::I64)?;

// In every loop below:
//  * `?` propagates a failing op instead of silently timing the error path
//    (the results used to be discarded with `let _ = ...`);
//  * `std::hint::black_box` discourages the optimizer from eliding work whose
//    result is otherwise unused.

// Softmax along dimension 2.
let start = std::time::Instant::now();
for _ in 0..100 {
    std::hint::black_box(softmax(&a, 2)?);
}
println!("softmax : {:?}", start.elapsed());

// GELU (erf variant).
let start = std::time::Instant::now();
for _ in 0..100 {
    std::hint::black_box(a.gelu_erf()?);
}
println!("Gelu : {:?}", start.elapsed());

// Addition of two same-shaped tensors.
let start = std::time::Instant::now();
for _ in 0..100 {
    std::hint::black_box(a.add(&b)?);
}
println!("add : {:?}", start.elapsed());

// Broadcast addition of the single-element tensor `c`.
let start = std::time::Instant::now();
for _ in 0..100 {
    std::hint::black_box(a.broadcast_add(&c)?);
}
println!("broadcast add  : {:?}", start.elapsed());

// Gather along dimension 2 using the I64 index tensor `d`.
let start = std::time::Instant::now();
for _ in 0..100 {
    std::hint::black_box(a.gather(&d, 2)?);
}
println!("gather  : {:?}", start.elapsed());

println!("{:?}", a.shape());
```

And these are the results compared with the equivalent Python (PyTorch) operations:

<img width="1384" alt="Screenshot 2024-03-24 at 1 17 22 AM" src="https://github.com/huggingface/candle/assets/121623449/5ac95f5b-46ea-4c5d-9187-c045a637e1f0">


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

candle tensor operations are bit slower than pytorch tensor operations #1926

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

candle tensor operations are bit slower than pytorch tensor operations #1926

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions