736
727
Some((nir, args, internal_args))
739
fn convert_spirv_to_nir(
730
pub(super) fn convert_spirv_to_nir(
731
build: &ProgramBuild,
742
args: Vec<spirv::SPIRVKernelArg>,
744
HashMap<Arc<Device>, NirShader>,
746
Vec<InternalKernelArg>,
749
let mut nirs = HashMap::new();
750
let mut args_set = HashSet::new();
751
let mut internal_args_set = HashSet::new();
752
let mut attributes_string_set = HashSet::new();
754
// TODO: we could run this in parallel?
755
for d in p.devs_with_build() {
756
let cache = d.screen().shader_cache();
757
let key = p.hash_key(d, name);
759
let res = if let Some(cache) = &cache {
760
cache.get(&mut key.unwrap()).and_then(|entry| {
761
let mut bin: &[u8] = &entry;
762
deserialize_nir(&mut bin, d)
768
let (nir, args, internal_args) = if let Some(res) = res {
771
let mut nir = p.to_nir(name, d);
773
/* this is a hack until we support fp16 properly and check for denorms inside
776
nir.preserve_fp16_denorms();
778
lower_and_optimize_nir_pre_inputs(d, &mut nir, &d.lib_clc);
779
let mut args = KernelArg::from_spirv_nir(&args, &mut nir);
780
let internal_args = lower_and_optimize_nir_late(d, &mut nir, &mut args);
782
if let Some(cache) = cache {
783
let mut bin = Vec::new();
784
let mut nir = nir.serialize();
786
bin.extend_from_slice(&nir.len().to_ne_bytes());
787
bin.append(&mut nir);
789
bin.extend_from_slice(&args.len().to_ne_bytes());
791
bin.append(&mut arg.serialize());
794
bin.extend_from_slice(&internal_args.len().to_ne_bytes());
795
for arg in &internal_args {
796
bin.append(&mut arg.serialize());
799
cache.put(&bin, &mut key.unwrap());
802
(nir, args, internal_args)
805
args_set.insert(args);
806
internal_args_set.insert(internal_args);
807
nirs.insert(d.clone(), nir);
808
attributes_string_set.insert(p.attribute_str(name, d));
733
args: &[spirv::SPIRVKernelArg],
735
) -> (NirShader, Vec<KernelArg>, Vec<InternalKernelArg>) {
736
let cache = dev.screen().shader_cache();
737
let key = build.hash_key(dev, name);
739
let res = if let Some(cache) = &cache {
740
cache.get(&mut key.unwrap()).and_then(|entry| {
741
let mut bin: &[u8] = &entry;
742
deserialize_nir(&mut bin, dev)
748
if let Some(res) = res {
751
let mut nir = build.to_nir(name, dev);
753
/* this is a hack until we support fp16 properly and check for denorms inside
756
nir.preserve_fp16_denorms();
758
lower_and_optimize_nir_pre_inputs(dev, &mut nir, &dev.lib_clc);
759
let mut args = KernelArg::from_spirv_nir(args, &mut nir);
760
let internal_args = lower_and_optimize_nir_late(dev, &mut nir, &mut args);
762
if let Some(cache) = cache {
763
let mut bin = Vec::new();
764
let mut nir = nir.serialize();
766
bin.extend_from_slice(&nir.len().to_ne_bytes());
767
bin.append(&mut nir);
769
bin.extend_from_slice(&args.len().to_ne_bytes());
771
bin.append(&mut arg.serialize());
774
bin.extend_from_slice(&internal_args.len().to_ne_bytes());
775
for arg in &internal_args {
776
bin.append(&mut arg.serialize());
779
cache.put(&bin, &mut key.unwrap());
782
(nir, args, internal_args)
811
// we want the same (internal) args for every compiled kernel, for now
812
assert!(args_set.len() == 1);
813
assert!(internal_args_set.len() == 1);
814
assert!(attributes_string_set.len() == 1);
815
let args = args_set.into_iter().next().unwrap();
816
let internal_args = internal_args_set.into_iter().next().unwrap();
818
// spec: For kernels not created from OpenCL C source and the clCreateProgramWithSource API call
819
// the string returned from this query [CL_KERNEL_ATTRIBUTES] will be empty.
820
let attributes_string = if p.is_src() {
821
attributes_string_set.into_iter().next().unwrap()
826
(nirs, args, internal_args, attributes_string)
829
786
fn extract<'a, const S: usize>(buf: &'a mut &[u8]) -> &'a [u8; S] {
838
pub fn new(name: String, prog: Arc<Program>, args: Vec<spirv::SPIRVKernelArg>) -> Arc<Kernel> {
839
let (mut nirs, args, internal_args, attributes_string) =
840
convert_spirv_to_nir(&prog, &name, args);
795
pub fn new(name: String, prog: Arc<Program>) -> Arc<Kernel> {
796
let nir_kernel_build = prog.get_nir_kernel_build(&name);
797
let nirs = &nir_kernel_build.nirs;
842
let nir = nirs.values_mut().next().unwrap();
799
let nir = nirs.values().next().unwrap();
843
800
let wgs = nir.workgroup_size();
844
801
let work_group_size = [wgs[0] as usize, wgs[1] as usize, wgs[2] as usize];
846
803
// can't use vec!...
847
let values = args.iter().map(|_| RefCell::new(None)).collect();
850
prog.kernel_count.fetch_add(1, Ordering::Relaxed);
804
let values = nir_kernel_build
807
.map(|_| RefCell::new(None))
853
811
base: CLObjectBase::new(),
857
814
work_group_size: work_group_size,
858
attributes_string: attributes_string,
815
subgroup_size: nir.subgroup_size() as usize,
816
num_subgroups: nir.num_subgroups() as usize,
860
internal_args: internal_args,
861
818
dev_state: KernelDevState::new(nirs),
819
build: nir_kernel_build,
1241
1201
self.dev_state.get(dev).info.preferred_simd_size as usize
1244
pub fn local_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {
1204
pub fn local_mem_size(&self, dev: &Device) -> cl_ulong {
1245
1205
// TODO include args
1246
1206
self.dev_state.get(dev).nir.shared_size() as cl_ulong
1209
pub fn has_svm_devs(&self) -> bool {
1210
self.prog.devs.iter().any(|dev| dev.svm_supported())
1213
pub fn subgroup_sizes(&self, dev: &Device) -> Vec<usize> {
1214
SetBitIndices::from_msb(self.dev_state.get(dev).info.simd_sizes)
1215
.map(|bit| 1 << bit)
1219
pub fn subgroups_for_block(&self, dev: &Device, block: &[usize]) -> usize {
1220
let subgroup_size = self.subgroup_size_for_block(dev, block);
1221
if subgroup_size == 0 {
1225
let threads = block.iter().product();
1226
div_round_up(threads, subgroup_size)
1229
pub fn subgroup_size_for_block(&self, dev: &Device, block: &[usize]) -> usize {
1230
let subgroup_sizes = self.subgroup_sizes(dev);
1231
if subgroup_sizes.is_empty() {
1235
if subgroup_sizes.len() == 1 {
1236
return subgroup_sizes[0];
1240
*block.first().unwrap_or(&1) as u32,
1241
*block.get(1).unwrap_or(&1) as u32,
1242
*block.get(2).unwrap_or(&1) as u32,
1246
.compute_state_subgroup_size(self.dev_state.get(dev).cso, &block) as usize
1250
1250
impl Clone for Kernel {