~mmach/netext73/mesa-haswell

(('idiv', a, '#b(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), '!options->lower_bitops'),

122

(('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),

123

(('imod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),

124

(('imod', a, '#b(is_neg_power_of_two)'), ('bcsel', ('ieq', ('ior', a, b), b), 0, ('ior', a, b)), '!options->lower_bitops'),

125

# 'irem(a, b)' -> 'a - ((a < 0 ? (a + b - 1) : a) & -b)'

126

(('irem', a, '#b(is_pos_power_of_two)'),

127

('isub', a, ('iand', ('bcsel', ('ilt', a, 0), ('iadd', a, ('isub', b, 1)), a), ('ineg', b))),

128

'!options->lower_bitops'),

129

(('irem', a, '#b(is_neg_power_of_two)'), ('irem', a, ('iabs', b)), '!options->lower_bitops'),

130

131

(('~fneg', ('fneg', a)), a),

132

(('ineg', ('ineg', a)), a),

133

(('fabs', ('fneg', a)), ('fabs', a)),

134

(('fabs', ('u2f', a)), ('u2f', a)),

135

(('iabs', ('iabs', a)), ('iabs', a)),

136

(('iabs', ('ineg', a)), ('iabs', a)),

137

(('f2b', ('fneg', a)), ('f2b', a)),

138

(('i2b', ('ineg', a)), ('i2b', a)),

139

(('~fadd', a, 0.0), a),

140

# a+0.0 is 'a' unless 'a' is denormal or -0.0. If it's only used by a

141

# floating point instruction, they should flush any input denormals and we

142

# can replace -0.0 with 0.0 if the float execution mode allows it.

143

(('fadd(is_only_used_as_float)', 'a@16', 0.0), a, '!'+signed_zero_inf_nan_preserve_16),

144

(('fadd(is_only_used_as_float)', 'a@32', 0.0), a, '!'+signed_zero_inf_nan_preserve_32),

145

(('iadd', a, 0), a),

146

(('usadd_4x8_vc4', a, 0), a),

147

(('usadd_4x8_vc4', a, ~0), ~0),

148

(('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),

149

(('~fadd', ('fmulz', a, b), ('fmulz', a, c)), ('fmulz', a, ('fadd', b, c))),

150

(('~ffma', a, b, ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),

151

(('~ffma', a, b, ('fmul(is_used_once)', a, c)), ('fmul', a, ('fadd', b, c))),

152

(('~fadd', ('fmul(is_used_once)', a, b), ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),

153

(('~ffma', a, ('fmul(is_used_once)', b, c), ('fmul(is_used_once)', b, d)), ('fmul', b, ('ffma', a, c, d))),

154

(('~ffmaz', a, b, ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),

155

(('~ffmaz', a, b, ('fmulz(is_used_once)', a, c)), ('fmulz', a, ('fadd', b, c))),

156

(('~fadd', ('fmulz(is_used_once)', a, b), ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),

157

(('~ffmaz', a, ('fmulz(is_used_once)', b, c), ('fmulz(is_used_once)', b, d)), ('fmulz', b, ('ffmaz', a, c, d))),

158

(('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),

159

(('iand', ('ior', a, b), ('ior', a, c)), ('ior', a, ('iand', b, c))),

160

(('ior', ('iand', a, b), ('iand', a, c)), ('iand', a, ('ior', b, c))),

161

(('~fadd', ('fneg', a), a), 0.0),

162

(('iadd', ('ineg', a), a), 0),

163

(('iadd', ('ineg', a), ('iadd', a, b)), b),

164

(('iadd', a, ('iadd', ('ineg', a), b)), b),

165

(('~fadd', ('fneg', a), ('fadd', a, b)), b),

166

(('~fadd', a, ('fadd', ('fneg', a), b)), b),

167

(('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))),

168

(('~fmul', a, 0.0), 0.0),

169

# The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN

170

(('fmul', 'a@16', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_16),

171

(('fmul', 'a@32', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_32),

172

(('fmulz', a, 0.0), 0.0),

173

(('fmulz', a, 'b(is_finite_not_zero)'), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_32),

174

(('fmulz', 'a(is_finite)', 'b(is_finite)'), ('fmul', a, b)),

175

(('fmulz', a, a), ('fmul', a, a)),

176

(('ffmaz', a, 'b(is_finite_not_zero)', c), ('ffma', a, b, c), '!'+signed_zero_inf_nan_preserve_32),

177

(('ffmaz', 'a(is_finite)', 'b(is_finite)', c), ('ffma', a, b, c)),

178

(('ffmaz', a, a, b), ('ffma', a, a, b)),

179

(('imul', a, 0), 0),

180

(('umul_unorm_4x8_vc4', a, 0), 0),

181

(('umul_unorm_4x8_vc4', a, ~0), a),

182

(('~fmul', a, 1.0), a),

183

(('~fmulz', a, 1.0), a),

184

# The only effect a*1.0 can have is flushing denormals. If it's only used by

185

# a floating point instruction, they should flush any input denormals and

186

# this multiplication isn't needed.

187

(('fmul(is_only_used_as_float)', a, 1.0), a),

188

(('imul', a, 1), a),

189

(('fmul', a, -1.0), ('fneg', a)),

190

(('imul', a, -1), ('ineg', a)),

191

# If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a

192

# If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a

193

# If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0

194

# If a != a: fsign(a)*a*a => 0*NaN*NaN => abs(NaN)*NaN

195

(('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),

196

(('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),

197

(('~ffma', 0.0, a, b), b),

198

(('ffma@16(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_16),

199

(('ffma@32(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_32),

200

(('ffmaz', 0.0, a, b), ('fadd', 0.0, b)),

201

(('~ffma', a, b, 0.0), ('fmul', a, b)),

202

(('ffma@16', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_16),

203

(('ffma@32', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_32),

204

(('ffmaz', a, b, 0.0), ('fmulz', a, b), '!'+signed_zero_inf_nan_preserve_32),

205

(('ffma', 1.0, a, b), ('fadd', a, b)),

206

(('ffmaz', 1.0, a, b), ('fadd', a, b), '!'+signed_zero_inf_nan_preserve_32),

207

(('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)),

208

(('ffmaz', -1.0, a, b), ('fadd', ('fneg', a), b), '!'+signed_zero_inf_nan_preserve_32),

209

(('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)),

210

(('~ffmaz', '#a', '#b', c), ('fadd', ('fmulz', a, b), c)),

211

(('~flrp', a, b, 0.0), a),

212

(('~flrp', a, b, 1.0), b),

213

(('~flrp', a, a, b), a),

214

(('~flrp', 0.0, a, b), ('fmul', a, b)),

215

216

# flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)

217

(('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),

218

219

(('sdot_4x8_iadd', a, 0, b), b),

220

(('udot_4x8_uadd', a, 0, b), b),

221

(('sdot_4x8_iadd_sat', a, 0, b), b),

222

(('udot_4x8_uadd_sat', a, 0, b), b),

223

(('sdot_2x16_iadd', a, 0, b), b),

224

(('udot_2x16_uadd', a, 0, b), b),

225

(('sdot_2x16_iadd_sat', a, 0, b), b),

226

(('udot_2x16_uadd_sat', a, 0, b), b),

227

228

# sudot_4x8_iadd is not commutative at all, so the patterns must be

229

# duplicated with zeros on each of the first positions.

230

(('sudot_4x8_iadd', a, 0, b), b),

231

(('sudot_4x8_iadd', 0, a, b), b),

232

(('sudot_4x8_iadd_sat', a, 0, b), b),

233

(('sudot_4x8_iadd_sat', 0, a, b), b),

234

235

(('iadd', ('sdot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_4x8_iadd', a, b, ('iadd', c, d))),

236

(('iadd', ('udot_4x8_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_4x8_uadd', a, b, ('iadd', c, d))),

237

(('iadd', ('sudot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sudot_4x8_iadd', a, b, ('iadd', c, d))),

238

(('iadd', ('sdot_2x16_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_2x16_iadd', a, b, ('iadd', c, d))),

239

(('iadd', ('udot_2x16_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_2x16_uadd', a, b, ('iadd', c, d))),

240

241

# Try to let constant folding eliminate the dot-product part. These are

242

# safe because the dot product cannot overflow 32 bits.

243

(('iadd', ('sdot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sdot_4x8_iadd', a, b, c)),

244

(('iadd', ('udot_4x8_uadd', 'a(is_not_const)', b, 0), c), ('udot_4x8_uadd', a, b, c)),

245

(('iadd', ('sudot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sudot_4x8_iadd', a, b, c)),

246

(('iadd', ('sudot_4x8_iadd', a, 'b(is_not_const)', 0), c), ('sudot_4x8_iadd', a, b, c)),

247

(('iadd', ('sdot_2x16_iadd', 'a(is_not_const)', b, 0), c), ('sdot_2x16_iadd', a, b, c)),

248

(('iadd', ('udot_2x16_uadd', 'a(is_not_const)', b, 0), c), ('udot_2x16_uadd', a, b, c)),

249

(('sdot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_4x8_iadd', a, b, 0), c)),

250

(('udot_4x8_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_4x8_uadd', a, b, 0), c)),

251

(('sudot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sudot_4x8_iadd', a, b, 0), c)),

252

(('sdot_2x16_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_2x16_iadd', a, b, 0), c)),

253

(('udot_2x16_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_2x16_uadd', a, b, 0), c)),

254

(('sdot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),

255

(('udot_4x8_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->lower_uadd_sat'),

256

(('sudot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),

257

(('sdot_2x16_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_2x16_iadd', a, b, 0), c), '!options->lower_iadd_sat'),

258

(('udot_2x16_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_2x16_uadd', a, b, 0), c), '!options->lower_uadd_sat'),

259

260

# Optimize open-coded fmulz.

261

# (b==0.0 ? 0.0 : a) * (a==0.0 ? 0.0 : b) -> fmulz(a, b)

262

(('fmul@32', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, a), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, b)),

263

('fmulz', a, b), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),

264

(('fmul@32', a, ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)')),

265

('fmulz', a, b), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),

266

267

# ffma(b==0.0 ? 0.0 : a, a==0.0 ? 0.0 : b, c) -> ffmaz(a, b, c)

268

(('ffma@32', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, a), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, b), c),

269

('ffmaz', a, b, c), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),

270

(('ffma@32', a, ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)'), c),

271

('ffmaz', a, b, c), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),

272

]

273

274

# Shorthand for the expansion of just the dot product part of the [iu]dp4a

275

# instructions.

276

sdot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_i8', b, 0)),

277

('imul', ('extract_i8', a, 1), ('extract_i8', b, 1))),

278

('iadd', ('imul', ('extract_i8', a, 2), ('extract_i8', b, 2)),

279

('imul', ('extract_i8', a, 3), ('extract_i8', b, 3))))

280

udot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_u8', a, 0), ('extract_u8', b, 0)),

281

('imul', ('extract_u8', a, 1), ('extract_u8', b, 1))),

282

('iadd', ('imul', ('extract_u8', a, 2), ('extract_u8', b, 2)),

283

('imul', ('extract_u8', a, 3), ('extract_u8', b, 3))))

284

sudot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_u8', b, 0)),

285

('imul', ('extract_i8', a, 1), ('extract_u8', b, 1))),

286

('iadd', ('imul', ('extract_i8', a, 2), ('extract_u8', b, 2)),

287

('imul', ('extract_i8', a, 3), ('extract_u8', b, 3))))

288

sdot_2x16_a_b = ('iadd', ('imul', ('extract_i16', a, 0), ('extract_i16', b, 0)),

289

('imul', ('extract_i16', a, 1), ('extract_i16', b, 1)))

290

udot_2x16_a_b = ('iadd', ('imul', ('extract_u16', a, 0), ('extract_u16', b, 0)),

291

('imul', ('extract_u16', a, 1), ('extract_u16', b, 1)))

292

293

optimizations.extend([

294

(('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),

295

(('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_udot_4x8'),

296

(('sudot_4x8_iadd', a, b, c), ('iadd', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),

297

(('sdot_2x16_iadd', a, b, c), ('iadd', sdot_2x16_a_b, c), '!options->has_dot_2x16'),

298

(('udot_2x16_uadd', a, b, c), ('iadd', udot_2x16_a_b, c), '!options->has_dot_2x16'),

299

300

# For the unsigned dot-product, the largest possible value 4*(255*255) =

301

# 0x3f804, so we don't have to worry about that intermediate result

302

# overflowing. 0x100000000 - 0x3f804 = 0xfffc07fc. If c is a constant

303

# that is less than 0xfffc07fc, then the result cannot overflow ever.

304

(('udot_4x8_uadd_sat', a, b, '#c(is_ult_0xfffc07fc)'), ('udot_4x8_uadd', a, b, c)),

305

(('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', udot_4x8_a_b, c), '!options->has_udot_4x8'),

306

307

# For the signed dot-product, the largest positive value is 4*(-128*-128) =

308

# 0x10000, and the largest negative value is 4*(-128*127) = -0xfe00. We

309

# don't have to worry about that intermediate result overflowing or

310

# underflowing.

311

(('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),

312

313

(('sudot_4x8_iadd_sat', a, b, c), ('iadd_sat', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),

314

315

(('udot_2x16_uadd_sat', a, b, c), ('uadd_sat', udot_2x16_a_b, c), '!options->has_dot_2x16'),

316

(('sdot_2x16_iadd_sat', a, b, c), ('iadd_sat', sdot_2x16_a_b, c), '!options->has_dot_2x16'),

317

])

318

319

# Float sizes

320

for s in [16, 32, 64]:

321

optimizations.extend([

322

(('~flrp@{}'.format(s), a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

323

324

(('~flrp@{}'.format(s), a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp{}'.format(s)),

325

(('~flrp@{}'.format(s), ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp{}'.format(s)),

326

(('~flrp@{}'.format(s), a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp{}'.format(s)),

327

328

(('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),

329

# These are the same as the previous three rules, but it depends on

330

# 1-fsat(x) <=> fsat(1-x). See below.

331

(('~fadd@{}'.format(s), ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c)))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp{}'.format(s)),

332

(('~fadd@{}'.format(s), a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),

333

334

(('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

335

(('~fadd@{}'.format(s), a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

336

337

(('~ffma@{}'.format(s), a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1'))), ('fmul', b, ('b2f', 'c@1'))), ('bcsel', c, b, a)),

338

(('~ffma@{}'.format(s), b, ('b2f', 'c@1'), ('ffma', ('fneg', a), ('b2f', 'c@1'), a)), ('bcsel', c, b, a)),

339

340

# These two aren't flrp lowerings, but do appear in some shaders.

341

(('~ffma@{}'.format(s), ('b2f', 'c@1'), ('fadd', b, ('fneg', a)), a), ('bcsel', c, b, a)),

342

(('~ffma@{}'.format(s), ('b2f', 'c@1'), ('ffma', ('fneg', a), b, d), ('fmul', a, b)), ('bcsel', c, d, ('fmul', a, b))),

343

344

# 1 - ((1 - a) * (1 - b))

345

# 1 - (1 - a - b + a*b)

346

# 1 - 1 + a + b - a*b

347

# a + b - a*b

348

# a + b*(1 - a)

349

# b*(1 - a) + 1*a

350

# flrp(b, 1, a)

351

(('~fadd@{}'.format(s), 1.0, ('fneg', ('fmul', ('fadd', 1.0, ('fneg', a)), ('fadd', 1.0, ('fneg', b))))), ('flrp', b, 1.0, a), '!options->lower_flrp{}'.format(s)),

352

])

353

354

optimizations.extend([

355

(('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),

356

357

(('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),

358

(('ftrunc', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),

359

360

# Approximate handling of fround_even for DX9 addressing from gallium nine on

361

# DX9-class hardware with no proper fround support.

362

(('fround_even', a), ('bcsel',

363

('feq', ('ffract', a), 0.5),

364

('fadd', ('ffloor', ('fadd', a, 0.5)), 1.0),

365

('ffloor', ('fadd', a, 0.5))), 'options->lower_fround_even'),

366

367

(('ffloor', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),

368

(('fadd', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),

369

(('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),

370

(('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),

371

(('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma16'),

372

(('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma32'),

373

(('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma64'),

374

(('ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->lower_ffma32'),

375

# Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).

376

(('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),

377

(('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),

378

(('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),

379

(('~ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->fuse_ffma32'),

380

381

(('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),

382

('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),

383

384

(('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),

385

386

(('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d), '!options->lower_fdph'),

387

(('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),

388

(('fdot4', ('vec4', a, b, 0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),

389

(('fdot4', ('vec4', a, b, c, 0.0), d), ('fdot3', ('vec3', a, b, c), d)),

390

391

(('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),

392

(('fdot3', ('vec3', a, b, 0.0), c), ('fdot2', ('vec2', a, b), c)),

393

394

(('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)),

395

(('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')),

396

397

# Lower fdot to fsum when it is available

398

(('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'),

399

(('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'),

400

(('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'),

401

(('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'),

402

403

# If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially

404

# If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1

405

# If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0

406

(('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),

407

408

# (a * #b + #c) << #d

409

# ((a * #b) << #d) + (#c << #d)

410

# (a * (#b << #d)) + (#c << #d)

411

(('ishl', ('iadd', ('imul', a, '#b'), '#c'), '#d'),

412

('iadd', ('imul', a, ('ishl', b, d)), ('ishl', c, d))),

413

414

# (a * #b) << #c

415

# a * (#b << #c)

416

(('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),

417

])

418

419

# Care must be taken here. Shifts in NIR uses only the lower log2(bitsize)

420

# bits of the second source. These replacements must correctly handle the

421

# case where (b % bitsize) + (c % bitsize) >= bitsize.

422

for s in [8, 16, 32, 64]:

423

mask = s - 1

424

425

ishl = "ishl@{}".format(s)

426

ishr = "ishr@{}".format(s)

427

ushr = "ushr@{}".format(s)

428

429

in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)

430

431

optimizations.extend([

432

((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),

433

((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),

434

435

# To get get -1 for large shifts of negative values, ishr must instead

436

# clamp the shift count to the maximum value.

437

((ishr, (ishr, a, '#b'), '#c'),

438

(ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),

439

])

440

441

# Optimize a pattern of address calculation created by DXVK where the offset is

442

# divided by 4 and then multipled by 4. This can be turned into an iand and the

443

# additions before can be reassociated to CSE the iand instruction.

444

445

for size, mask in ((8, 0xff), (16, 0xffff), (32, 0xffffffff), (64, 0xffffffffffffffff)):

446

a_sz = 'a@{}'.format(size)

447

448

optimizations.extend([

449

# 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'

450

(('ishl', ('ushr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),

451

(('ishl', ('ishr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),

452

453

# This does not trivially work with ishr.

454

(('ushr', ('ishl', a_sz, '#b'), b), ('iand', a, ('ushr', mask, b))),

455

])

456

457

optimizations.extend([

458

(('iand', ('ishl', 'a@32', '#b(is_first_5_bits_uge_2)'), -4), ('ishl', a, b)),

459

(('iand', ('imul', a, '#b(is_unsigned_multiple_of_4)'), -4), ('imul', a, b)),

460

])

461

462

for log2 in range(1, 7): # powers of two from 2 to 64

463

v = 1 << log2

464

mask = 0xffffffff & ~(v - 1)

465

b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v)

466

467

optimizations.extend([

468

# Reassociate for improved CSE

469

(('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),

470

])

471

472

# To save space in the state tables, reduce to the set that is known to help.

473

# Previously, this was range(1, 32). In addition, a couple rules inside the

474

# loop are commented out. Revisit someday, probably after mesa/#2635 has some

475

# resolution.

476

for i in [1, 2, 16, 24]:

477

lo_mask = 0xffffffff >> i

478

hi_mask = (0xffffffff << i) & 0xffffffff

479

480

optimizations.extend([

481

# This pattern seems to only help in the soft-fp64 code.

482

(('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)),

483

# (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)),

484

# (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)),

485

486

(('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)),

487

(('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)),

488

# (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct

489

])

490

491

optimizations.extend([

492

# This is common for address calculations. Reassociating may enable the

493

# 'a<<c' to be CSE'd. It also helps architectures that have an ISHLADD

494

# instruction or a constant offset field for in load / store instructions.

495

(('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))),

496

497

# (a + #b) * #c => (a * #c) + (#b * #c)

498

(('imul', ('iadd(is_used_once)', a, '#b'), '#c'), ('iadd', ('imul', a, c), ('imul', b, c))),

499

500

# ((a + #b) + c) * #d => ((a + c) * #d) + (#b * #d)

501

(('imul', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),

502

('iadd', ('imul', ('iadd', a, c), d), ('imul', b, d))),

503

(('ishl', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),

504

('iadd', ('ishl', ('iadd', a, c), d), ('ishl', b, d))),

505

506

# Comparison simplifications

507

(('inot', ('flt(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('fge', a, b)),

508

(('inot', ('fge(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('flt', a, b)),

509

(('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)),

510

(('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)),

511

(('inot', ('ilt(is_used_once)', a, b)), ('ige', a, b)),

512

(('inot', ('ult(is_used_once)', a, b)), ('uge', a, b)),

513

(('inot', ('ige(is_used_once)', a, b)), ('ilt', a, b)),

514

(('inot', ('uge(is_used_once)', a, b)), ('ult', a, b)),

515

(('inot', ('ieq(is_used_once)', a, b)), ('ine', a, b)),

516

(('inot', ('ine(is_used_once)', a, b)), ('ieq', a, b)),

517

518

(('iand', ('feq', a, b), ('fneu', a, b)), False),

519

(('iand', ('flt', a, b), ('flt', b, a)), False),

520

(('iand', ('ieq', a, b), ('ine', a, b)), False),

521

(('iand', ('ilt', a, b), ('ilt', b, a)), False),

522

(('iand', ('ult', a, b), ('ult', b, a)), False),

523

524

# This helps some shaders because, after some optimizations, they end up

525

# with patterns like (-a < -b) || (b < a). In an ideal world, this sort of

526

# matching would be handled by CSE.

527

(('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),

528

(('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),

529

(('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),

530

(('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),

531

(('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),

532

(('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),

533

(('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),

534

(('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),

535

(('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),

536

(('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),

537

538

# b < fsat(NaN) -> b < 0 -> false, and b < Nan -> false.

539

(('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),

540

541

# fsat(NaN) >= b -> 0 >= b -> false, and NaN >= b -> false.

542

(('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),

543

544

# b == fsat(NaN) -> b == 0 -> false, and b == NaN -> false.

545

(('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),

546

547

# b != fsat(NaN) -> b != 0 -> true, and b != NaN -> true.

548

(('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),

549

550

# fsat(NaN) >= 1 -> 0 >= 1 -> false, and NaN >= 1 -> false.

551

(('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),

552

553

# 0 < fsat(NaN) -> 0 < 0 -> false, and 0 < NaN -> false.

554

(('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)),

555

556

# 0.0 >= b2f(a)

557

# b2f(a) <= 0.0

558

# b2f(a) == 0.0 because b2f(a) can only be 0 or 1

559

# inot(a)

560

(('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),

561

562

(('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)),

563

564

(('fneu', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),

565

(('fneu', ('bcsel', a, 1.0, ('b2f', 'b@1')) , 0.0), ('ior', a, b)),

566

(('fneu', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('ior', a, b)),

567

(('fneu', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),

568

(('fneu', ('bcsel', a, ('b2f', 'b@1'), 0.0) , 0.0), ('iand', a, b)),

569

(('fneu', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)),

570

(('fneu', ('b2f', 'a@1') , ('b2f', 'b@1') ), ('ixor', a, b)),

571

(('fneu', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ixor', a, b)),

572

(('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),

573

(('feq', ('bcsel', a, 1.0, ('b2f', 'b@1')) , 0.0), ('inot', ('ior', a, b))),

574

(('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('inot', ('ior', a, b))),

575

(('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),

576

(('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0) , 0.0), ('inot', ('iand', a, b))),

577

(('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)),

578

(('feq', ('b2f', 'a@1') , ('b2f', 'b@1') ), ('ieq', a, b)),

579

(('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ieq', a, b)),

580

581

# -(b2f(a) + b2f(b)) < 0

582

# 0 < b2f(a) + b2f(b)

583

# 0 != b2f(a) + b2f(b) b2f must be 0 or 1, so the sum is non-negative

584

# a || b

585

(('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)),

586

(('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)),

587

588

# -(b2f(a) + b2f(b)) >= 0

589

# 0 >= b2f(a) + b2f(b)

590

# 0 == b2f(a) + b2f(b) b2f must be 0 or 1, so the sum is non-negative

591

# !(a || b)

592

(('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),

593

(('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),

594

595

(('flt', a, ('fneg', a)), ('flt', a, 0.0)),

596

(('fge', a, ('fneg', a)), ('fge', a, 0.0)),

597

598

# Some optimizations (below) convert things like (a < b || c < b) into

599

# (min(a, c) < b). However, this interfers with the previous optimizations

600

# that try to remove comparisons with negated sums of b2f. This just

601

# breaks that apart.

602

(('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0),

603

('ior', ('flt', c, 0.0), ('ior', a, b))),

604

605

(('~flt', ('fadd', a, b), a), ('flt', b, 0.0)),

606

(('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),

607

(('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),

608

(('~fneu', ('fadd', a, b), a), ('fneu', b, 0.0)),

609

(('~flt', ('fadd(is_used_once)', a, '#b'), '#c'), ('flt', a, ('fadd', c, ('fneg', b)))),

610

(('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)),

611

(('~fge', ('fadd(is_used_once)', a, '#b'), '#c'), ('fge', a, ('fadd', c, ('fneg', b)))),

612

(('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)),

613

(('~feq', ('fadd(is_used_once)', a, '#b'), '#c'), ('feq', a, ('fadd', c, ('fneg', b)))),

614

(('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)),

615

(('~fneu', ('fadd(is_used_once)', a, '#b'), '#c'), ('fneu', a, ('fadd', c, ('fneg', b)))),

616

(('~fneu', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fneu', ('fneg', ('fadd', c, b)), a)),

617

618

# Cannot remove the addition from ilt or ige due to overflow.

619

(('ieq', ('iadd', a, b), a), ('ieq', b, 0)),

620

(('ine', ('iadd', a, b), a), ('ine', b, 0)),

621

622

(('feq', ('b2f', 'a@1'), 0.0), ('inot', a)),

623

(('fneu', ('b2f', 'a@1'), 0.0), a),

624

(('ieq', ('b2i', 'a@1'), 0), ('inot', a)),

625

(('ine', ('b2i', 'a@1'), 0), a),

626

627

(('fneu', ('u2f', a), 0.0), ('ine', a, 0)),

628

(('feq', ('u2f', a), 0.0), ('ieq', a, 0)),

629

(('fge', ('u2f', a), 0.0), True),

630

(('fge', 0.0, ('u2f', a)), ('uge', 0, a)), # ieq instead?

631

(('flt', ('u2f', a), 0.0), False),

632

(('flt', 0.0, ('u2f', a)), ('ult', 0, a)), # ine instead?

633

(('fneu', ('i2f', a), 0.0), ('ine', a, 0)),

634

(('feq', ('i2f', a), 0.0), ('ieq', a, 0)),

635

(('fge', ('i2f', a), 0.0), ('ige', a, 0)),

636

(('fge', 0.0, ('i2f', a)), ('ige', 0, a)),

637

(('flt', ('i2f', a), 0.0), ('ilt', a, 0)),

638

(('flt', 0.0, ('i2f', a)), ('ilt', 0, a)),

639

640

# 0.0 < fabs(a)

641

# fabs(a) > 0.0

642

# fabs(a) != 0.0 because fabs(a) must be >= 0

643

# a != 0.0

644

(('~flt', 0.0, ('fabs', a)), ('fneu', a, 0.0)),

645

646

# -fabs(a) < 0.0

647

# fabs(a) > 0.0

648

(('~flt', ('fneg', ('fabs', a)), 0.0), ('fneu', a, 0.0)),

649

650

# 0.0 >= fabs(a)

651

# 0.0 == fabs(a) because fabs(a) must be >= 0

652

# 0.0 == a

653

(('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)),

654

655

# -fabs(a) >= 0.0

656

# 0.0 >= fabs(a)

657

(('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),

658

659

# (a >= 0.0) && (a <= 1.0) -> fsat(a) == a

660

661

# This should be NaN safe.

662

663

# NaN >= 0 && 1 >= NaN -> false && false -> false

664

665

# vs.

666

667

# NaN == fsat(NaN) -> NaN == 0 -> false

668

(('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'),

669

670

# Note: fmin(-a, -b) == -fmax(a, b)

671

(('fmax', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('ior', a, b))),

672

(('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))),

673

(('fmin', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),

674

(('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))),

675

676

# fmin(b2f(a), b)

677

# bcsel(a, fmin(b2f(a), b), fmin(b2f(a), b))

678

# bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b))

679

# bcsel(a, fmin(1.0, b), fmin(0.0, b))

680

681

# Since b is a constant, constant folding will eliminate the fmin and the

682

# fmax. If b is > 1.0, the bcsel will be replaced with a b2f.

683

(('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))),

684

685

(('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)),

686

687

(('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),

688

(('~bcsel', ('flt', b, a), b, a), ('fmin', a, b)),

689

(('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)),

690

(('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)),

691

(('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)),

692

(('bcsel', ('i2b', a), b, c), ('bcsel', ('ine', a, 0), b, c)),

693

(('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)),

694

(('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),

695

(('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)),

696

(('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),

697

(('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),

698

(('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),

699

(('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),

700

(('bcsel', a, True, b), ('ior', a, b)),

701

(('bcsel', a, a, b), ('ior', a, b)),

702

(('bcsel', a, b, False), ('iand', a, b)),

703

(('bcsel', a, b, a), ('iand', a, b)),

704

(('~fmin', a, a), a),

705

(('~fmax', a, a), a),

706

(('imin', a, a), a),

707

(('imax', a, a), a),

708

(('umin', a, a), a),

709

(('umin', a, 0), 0),

710

(('umin', a, -1), a),

711

(('umax', a, a), a),

712

(('umax', a, 0), a),

713

(('umax', a, -1), -1),

714

(('fmax', ('fmax', a, b), b), ('fmax', a, b)),

715

(('umax', ('umax', a, b), b), ('umax', a, b)),

716

(('imax', ('imax', a, b), b), ('imax', a, b)),

717

(('fmin', ('fmin', a, b), b), ('fmin', a, b)),

718

(('umin', ('umin', a, b), b), ('umin', a, b)),

719

(('imin', ('imin', a, b), b), ('imin', a, b)),

720

(('fmax', ('fmax', ('fmax', a, b), c), a), ('fmax', ('fmax', a, b), c)),

721

(('umax', ('umax', ('umax', a, b), c), a), ('umax', ('umax', a, b), c)),

722

(('imax', ('imax', ('imax', a, b), c), a), ('imax', ('imax', a, b), c)),

723

(('fmin', ('fmin', ('fmin', a, b), c), a), ('fmin', ('fmin', a, b), c)),

724

(('umin', ('umin', ('umin', a, b), c), a), ('umin', ('umin', a, b), c)),

725

(('imin', ('imin', ('imin', a, b), c), a), ('imin', ('imin', a, b), c)),

726

])

727

728

for N in [8, 16, 32, 64]:

729

b2iN = 'b2i{0}'.format(N)

730

optimizations.extend([

731

(('ieq', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ieq', a, b)),

732

(('ine', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ine', a, b)),

733

])

734

735

for N in [16, 32, 64]:

736

b2fN = 'b2f{0}'.format(N)

737

optimizations.extend([

738

(('feq', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ieq', a, b)),

739

(('fneu', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ine', a, b)),

740

])

741

742

# Integer sizes

743

for s in [8, 16, 32, 64]:

744

optimizations.extend([

745

(('iand@{}'.format(s), a, ('inot', ('ishr', a, s - 1))), ('imax', a, 0)),

746

747

# Simplify logic to detect sign of an integer.

748

(('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0), ('ige', a, 0)),

749

(('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ige', a, 0)),

750

(('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0), ('ilt', a, 0)),

751

(('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ilt', a, 0)),

752

(('ine', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),

753

(('ieq', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),

754

(('ieq', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ilt', a, 0)),

755

(('ine', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ige', a, 0)),

756

(('ine', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),

757

(('ieq', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),

758

(('ieq', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ilt', a, 0)),

759

(('ine', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ige', a, 0)),

760

])

761

762

optimizations.extend([

763

(('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),

764

(('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),

765

(('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),

766

(('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))),

767

(('~fmin', a, ('fabs', a)), a),

768

(('imin', a, ('iabs', a)), a),

769

(('~fmax', a, ('fneg', ('fabs', a))), a),

770

(('imax', a, ('ineg', ('iabs', a))), a),

771

(('fmax', a, ('fabs', a)), ('fabs', a)),

772

(('imax', a, ('iabs', a)), ('iabs', a)),

773

(('fmax', a, ('fneg', a)), ('fabs', a)),

774

(('imax', a, ('ineg', a)), ('iabs', a), '!options->lower_iabs'),

775

(('~fmax', ('fabs', a), 0.0), ('fabs', a)),

776

(('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),

777

# fmax(fmin(a, 1.0), 0.0) is inexact because it returns 1.0 on NaN, while

778

# fsat(a) returns 0.0.

779

(('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),

780

# fmin(fmax(a, -1.0), 0.0) is inexact because it returns -1.0 on NaN, while

781

# fneg(fsat(fneg(a))) returns -0.0 on NaN.

782

(('~fmin', ('fmax', a, -1.0), 0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),

783

# fmax(fmin(a, 0.0), -1.0) is inexact because it returns 0.0 on NaN, while

784

# fneg(fsat(fneg(a))) returns -0.0 on NaN. This only matters if

785

# SignedZeroInfNanPreserve is set, but we don't currently have any way of

786

# representing this in the optimizations other than the usual ~.

787

(('~fmax', ('fmin', a, 0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),

788

# fsat(fsign(NaN)) = fsat(0) = 0, and b2f(0 < NaN) = b2f(False) = 0. Mark

789

# the new comparison precise to prevent it being changed to 'a != 0'.

790

(('fsat', ('fsign', a)), ('b2f', ('!flt', 0.0, a))),

791

(('fsat', ('b2f', a)), ('b2f', a)),

792

(('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),

793

(('fsat', ('fsat', a)), ('fsat', a)),

794

(('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'),

795

(('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'),

796

(('fsat', ('fneg(is_used_once)', ('fmulz(is_used_once)', a, b))), ('fsat', ('fmulz', ('fneg', a), b)), '!options->lower_fsat && !'+signed_zero_inf_nan_preserve_32),

797

(('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),

798

(('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),

799

(('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),

800

(('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),

801

# Both the left and right patterns are "b" when isnan(a), so this is exact.

802

(('fmax', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmax', a, b))),

803

# The left pattern is 0.0 when isnan(a) (because fmin(fsat(NaN), b) ->

804

# fmin(0.0, b)) while the right one is "b", so this optimization is inexact.

805

(('~fmin', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmin', a, b))),

806

807

# max(-min(b, a), b) -> max(abs(b), -a)

808

# min(-max(b, a), b) -> min(-abs(b), -a)

809

(('fmax', ('fneg', ('fmin', b, a)), b), ('fmax', ('fabs', b), ('fneg', a))),

810

(('fmin', ('fneg', ('fmax', b, a)), b), ('fmin', ('fneg', ('fabs', b)), ('fneg', a))),

811

812

# If a in [0,b] then b-a is also in [0,b]. Since b in [0,1], max(b-a, 0) =

813

# fsat(b-a).

814

815

# If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0

816

817

# This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0.

818

(('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b(is_zero_to_one)'), 0.0),

819

('fsat', ('fadd', ('fneg', a), b)), '!options->lower_fsat'),

820

821

(('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),

822

823

# The ior versions are exact because fmin and fmax will always pick a

824

# non-NaN value, if one exists. Therefore (a < NaN) || (a < c) == a <

825

# fmax(NaN, c) == a < c. Mark the fmin or fmax in the replacement as exact

826

# to prevent other optimizations from ruining the "NaN clensing" property

827

# of the fmin or fmax.

828

(('ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('!fmax', b, c))),

829

(('ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('!fmin', a, b), c)),

830

(('ior', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('!fmin', b, c))),

831

(('ior', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('!fmax', a, b), c)),

832

(('ior', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('!fmax', b, c))),

833

(('ior', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('!fmin', a, b), c)),

834

(('ior', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('!fmin', b, c))),

835

(('ior', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('!fmax', a, b), c)),

836

(('~iand', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmin', b, c))),

837

(('~iand', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmax', a, b), c)),

838

(('~iand', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmax', b, c))),

839

(('~iand', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmin', a, b), c)),

840

(('iand', ('flt', a, '#b(is_a_number)'), ('flt', a, '#c(is_a_number)')), ('flt', a, ('fmin', b, c))),

841

(('iand', ('flt', '#a(is_a_number)', c), ('flt', '#b(is_a_number)', c)), ('flt', ('fmax', a, b), c)),

842

(('iand', ('fge', a, '#b(is_a_number)'), ('fge', a, '#c(is_a_number)')), ('fge', a, ('fmax', b, c))),

843

(('iand', ('fge', '#a(is_a_number)', c), ('fge', '#b(is_a_number)', c)), ('fge', ('fmin', a, b), c)),

844

845

(('ior', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imax', b, c))),

846

(('ior', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imin', a, b), c)),

847

(('ior', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imin', b, c))),

848

(('ior', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imax', a, b), c)),

849

(('ior', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umax', b, c))),

850

(('ior', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umin', a, b), c)),

851

(('ior', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umin', b, c))),

852

(('ior', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umax', a, b), c)),

853

(('iand', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imin', b, c))),

854

(('iand', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imax', a, b), c)),

855

(('iand', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imax', b, c))),

856

(('iand', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imin', a, b), c)),

857

(('iand', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umin', b, c))),

858

(('iand', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umax', a, b), c)),

859

(('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))),

860

(('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)),

861

862

# A number of shaders contain a pattern like a.x < 0.0 || a.x > 1.0 || a.y

863

# < 0.0, || a.y > 1.0 || ... These patterns rearrange and replace in a

864

# single step. Doing just the replacement can lead to an infinite loop as

865

# the pattern is repeatedly applied to the result of the previous

866

# application of the pattern.

867

(('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, c), d), ('flt', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),

868

(('ior', ('ior(is_used_once)', ('flt', a, c), d), ('flt(is_used_once)', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),

869

(('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, b), d), ('flt', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),

870

(('ior', ('ior(is_used_once)', ('flt', a, b), d), ('flt(is_used_once)', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),

871

872

# This is how SpvOpFOrdNotEqual might be implemented. If both values are

873

# numbers, then it can be replaced with fneu.

874

(('ior', ('flt', 'a(is_a_number)', 'b(is_a_number)'), ('flt', b, a)), ('fneu', a, b)),

875

])

876

877

# Float sizes

878

for s in [16, 32, 64]:

879

optimizations.extend([

880

# These derive from the previous patterns with the application of b < 0 <=>

881

# 0 < -b. The transformation should be applied if either comparison is

882

# used once as this ensures that the number of comparisons will not

883

# increase. The sources to the ior and iand are not symmetric, so the

884

# rules have to be duplicated to get this behavior.

885

(('ior', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),

886

(('ior', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),

887

(('ior', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),

888

(('ior', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),

889

(('~iand', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),

890

(('~iand', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),

891

(('~iand', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),

892

(('~iand', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),

893

894

# The (i2f32, ...) part is an open-coded fsign. When that is combined

895

# with the bcsel, it's basically copysign(1.0, a). There are some

896

# behavior differences between this pattern and copysign w.r.t. ±0 and

897

# NaN. copysign(x, y) blindly takes the sign bit from y and applies it

898

# to x, regardless of whether either or both values are NaN.

899

900

# If a != a: bcsel(False, 1.0, i2f(b2i(False) - b2i(False))) = 0,

901

# int(NaN >= 0.0) - int(NaN < 0.0) = 0 - 0 = 0

902

# If a == ±0: bcsel(True, 1.0, ...) = 1.0,

903

# int(±0.0 >= 0.0) - int(±0.0 < 0.0) = 1 - 0 = 1

904

905

# For all other values of 'a', the original and replacement behave as

906

# copysign.

907

908

# Marking the replacement comparisons as precise prevents any future

909

# optimizations from replacing either of the comparisons with the

910

# logical-not of the other.

911

912

# Note: Use b2i32 in the replacement because some platforms that

913

# support fp16 don't support int16.

914

(('bcsel@{}'.format(s), ('feq', a, 0.0), 1.0, ('i2f{}'.format(s), ('iadd', ('b2i{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))))),

915

('i2f{}'.format(s), ('iadd', ('b2i32', ('!fge', a, 0.0)), ('ineg', ('b2i32', ('!flt', a, 0.0)))))),

916

917

(('bcsel', a, ('b2f(is_used_once)', 'b@{}'.format(s)), ('b2f', 'c@{}'.format(s))), ('b2f', ('bcsel', a, b, c))),

918

919

# The C spec says, "If the value of the integral part cannot be represented

920

# by the integer type, the behavior is undefined." "Undefined" can mean

921

# "the conversion doesn't happen at all."

922

(('~i2f{}'.format(s), ('f2i', 'a@{}'.format(s))), ('ftrunc', a)),

923

924

# Ironically, mark these as imprecise because removing the conversions may

925

# preserve more precision than doing the conversions (e.g.,

926

# uint(float(0x81818181u)) == 0x81818200).

927

(('~f2i{}'.format(s), ('i2f', 'a@{}'.format(s))), a),

928

(('~f2i{}'.format(s), ('u2f', 'a@{}'.format(s))), a),

929

(('~f2u{}'.format(s), ('i2f', 'a@{}'.format(s))), a),

930

(('~f2u{}'.format(s), ('u2f', 'a@{}'.format(s))), a),

931

932

(('fadd', ('b2f{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('fneg', ('b2f{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))), ('fsign', a), '!options->lower_fsign'),

933

(('iadd', ('b2i{}'.format(s), ('flt', 0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0)))), ('f2i{}'.format(s), ('fsign', a)), '!options->lower_fsign'),

934

])

935

936

# float? -> float? -> floatS ==> float? -> floatS

937

(('~f2f{}'.format(s), ('f2f', a)), ('f2f{}'.format(s), a)),

938

939

# int? -> float? -> floatS ==> int? -> floatS

940

(('~f2f{}'.format(s), ('u2f', a)), ('u2f{}'.format(s), a)),

941

(('~f2f{}'.format(s), ('i2f', a)), ('i2f{}'.format(s), a)),

942

943

# float? -> float? -> intS ==> float? -> intS

944

(('~f2u{}'.format(s), ('f2f', a)), ('f2u{}'.format(s), a)),

945

(('~f2i{}'.format(s), ('f2f', a)), ('f2i{}'.format(s), a)),

946

947

for B in [32, 64]:

948

if s < B:

949

optimizations.extend([

950

# S = smaller, B = bigger

951

# typeS -> typeB -> typeS ==> identity

952

(('f2f{}'.format(s), ('f2f{}'.format(B), 'a@{}'.format(s))), a),

953

(('i2i{}'.format(s), ('i2i{}'.format(B), 'a@{}'.format(s))), a),

954

(('u2u{}'.format(s), ('u2u{}'.format(B), 'a@{}'.format(s))), a),

955

956

# bool1 -> typeB -> typeS ==> bool1 -> typeS

957

(('f2f{}'.format(s), ('b2f{}'.format(B), 'a@1')), ('b2f{}'.format(s), a)),

958

(('i2i{}'.format(s), ('b2i{}'.format(B), 'a@1')), ('b2i{}'.format(s), a)),

959

(('u2u{}'.format(s), ('b2i{}'.format(B), 'a@1')), ('b2i{}'.format(s), a)),

960

961

# floatS -> floatB -> intB ==> floatS -> intB

962

(('f2u{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2u{}'.format(B), a)),

963

(('f2i{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2i{}'.format(B), a)),

964

965

# int? -> floatB -> floatS ==> int? -> floatS

966

(('f2f{}'.format(s), ('u2f{}'.format(B), a)), ('u2f{}'.format(s), a)),

967

(('f2f{}'.format(s), ('i2f{}'.format(B), a)), ('i2f{}'.format(s), a)),

968

969

# intS -> intB -> floatB ==> intS -> floatB

970

(('u2f{}'.format(B), ('u2u{}'.format(B), 'a@{}'.format(s))), ('u2f{}'.format(B), a)),

971

(('i2f{}'.format(B), ('i2i{}'.format(B), 'a@{}'.format(s))), ('i2f{}'.format(B), a)),

972

])

973

974

# mediump variants of the above

975

optimizations.extend([

976

# int32 -> float32 -> float16 ==> int32 -> float16

977

(('f2fmp', ('u2f32', 'a@32')), ('u2fmp', a)),

978

(('f2fmp', ('i2f32', 'a@32')), ('i2fmp', a)),

979

980

# float32 -> float16 -> int16 ==> float32 -> int16

981

(('f2u16', ('f2fmp', 'a@32')), ('f2u16', a)),

982

(('f2i16', ('f2fmp', 'a@32')), ('f2i16', a)),

983

984

# float32 -> int32 -> int16 ==> float32 -> int16

985

(('i2imp', ('f2u32', 'a@32')), ('f2ump', a)),

986

(('i2imp', ('f2i32', 'a@32')), ('f2imp', a)),

987

988

# int32 -> int16 -> float16 ==> int32 -> float16

989

(('u2f16', ('i2imp', 'a@32')), ('u2f16', a)),

990

(('i2f16', ('i2imp', 'a@32')), ('i2f16', a)),

991

])

992

993

# Clean up junk left from 8-bit integer to 16-bit integer lowering.

994

optimizations.extend([

995

# The u2u16(u2u8(X)) just masks off the upper 8-bits of X. This can be

996

# accomplished by mask the upper 8-bit of the immediate operand to the

997

# iand instruction. Often times, both patterns will end up being applied

998

# to the same original expression tree.

999

(('iand', ('u2u16', ('u2u8', 'a@16')), '#b'), ('iand', a, ('iand', b, 0xff))),

1000

(('u2u16', ('u2u8(is_used_once)', ('iand', 'a@16', '#b'))), ('iand', a, ('iand', b, 0xff))),

1001

])

1002

1003

for op in ['iand', 'ior', 'ixor']:

1004

optimizations.extend([

1005

(('u2u8', (op, ('u2u16', ('u2u8', 'a@16')), ('u2u16', ('u2u8', 'b@16')))), ('u2u8', (op, a, b))),

1006

(('u2u8', (op, ('u2u16', ('u2u8', 'a@32')), ('u2u16', ('u2u8', 'b@32')))), ('u2u8', (op, a, b))),

1007

1008

# Undistribute extract from a logic op

1009

((op, ('extract_i8', a, '#b'), ('extract_i8', c, b)), ('extract_i8', (op, a, c), b)),

1010

((op, ('extract_u8', a, '#b'), ('extract_u8', c, b)), ('extract_u8', (op, a, c), b)),

1011

((op, ('extract_i16', a, '#b'), ('extract_i16', c, b)), ('extract_i16', (op, a, c), b)),

1012

((op, ('extract_u16', a, '#b'), ('extract_u16', c, b)), ('extract_u16', (op, a, c), b)),

1013

1014

# Undistribute shifts from a logic op

1015

((op, ('ushr(is_used_once)', a, '#b'), ('ushr', c, b)), ('ushr', (op, a, c), b)),

1016

((op, ('ishr(is_used_once)', a, '#b'), ('ishr', c, b)), ('ishr', (op, a, c), b)),

1017

((op, ('ishl(is_used_once)', a, '#b'), ('ishl', c, b)), ('ishl', (op, a, c), b)),

1018

])

1019

1020

# Integer sizes

1021

for s in [8, 16, 32, 64]:

1022

optimizations.extend([

1023

(('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('ior', a, b), 0), 'options->lower_umax'),

1024

(('ior', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('ior', a, b), 0), 'options->lower_umin'),

1025

(('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umax', a, b), 0), '!options->lower_umax'),

1026

(('ior', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0), '!options->lower_umin'),

1027

(('iand', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0), '!options->lower_umin'),

1028

(('ior', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umax', a, b), 0), '!options->lower_umax'),

1029

1030

# True/False are ~0 and 0 in NIR. b2i of True is 1, and -1 is ~0 (True).

1031

(('ineg', ('b2i{}'.format(s), 'a@{}'.format(s))), a),

1032

1033

# SM5 32-bit shifts are defined to use the 5 least significant bits (or 4 bits for 16 bits)

1034

(('ishl', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishl', a, b)),

1035

(('ishr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishr', a, b)),

1036

(('ushr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ushr', a, b)),

1037

])

1038

1039

optimizations.extend([

1040

# Common pattern like 'if (i == 0 || i == 1 || ...)'

1041

(('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)),

1042

(('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),

1043

(('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)),

1044

1045

(('ior', a, ('ieq', a, False)), True),

1046

(('ior', a, ('inot', a)), -1),

1047

1048

(('ine', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), ('ine', a, b)),

1049

(('b2i', ('ine', 'a@1', 'b@1')), ('b2i', ('ixor', a, b))),

1050

1051

# This pattern occurs coutresy of __flt64_nonnan in the soft-fp64 code.

1052

# The first part of the iand comes from the !__feq64_nonnan.

1053

1054

# The second pattern is a reformulation of the first based on the relation

1055

# (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation

1056

# happens to be y == 0.

1057

(('iand', ('inot', ('iand', ('ior', ('ieq', a, 0), b), c)), ('ilt', a, 0)),

1058

('iand', ('inot', ('iand', b , c)), ('ilt', a, 0))),

1059

(('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)),

1060

('iand', ('inot', ('iand', ('ieq', b , 0), c)), ('ilt', a, 0))),

1061

1062

# These patterns can result when (a < b || a < c) => (a < min(b, c))

1063

# transformations occur before constant propagation and loop-unrolling.

1064

1065

# The flt versions are exact. If isnan(a), the original pattern is

1066

# trivially false, and the replacements are false too. If isnan(b):

1067

1068

# a < fmax(NaN, a) => a < a => false vs a < NaN => false

1069

(('flt', a, ('fmax', b, a)), ('flt', a, b)),

1070

(('flt', ('fmin', a, b), a), ('flt', b, a)),

1071

(('~fge', a, ('fmin', b, a)), True),

1072

(('~fge', ('fmax', a, b), a), True),

1073

(('flt', a, ('fmin', b, a)), False),

1074

(('flt', ('fmax', a, b), a), False),

1075

(('~fge', a, ('fmax', b, a)), ('fge', a, b)),

1076

(('~fge', ('fmin', a, b), a), ('fge', b, a)),

1077

1078

(('ilt', a, ('imax', b, a)), ('ilt', a, b)),

1079

(('ilt', ('imin', a, b), a), ('ilt', b, a)),

1080

(('ige', a, ('imin', b, a)), True),

1081

(('ige', ('imax', a, b), a), True),

1082

(('ult', a, ('umax', b, a)), ('ult', a, b)),

1083

(('ult', ('umin', a, b), a), ('ult', b, a)),

1084

(('uge', a, ('umin', b, a)), True),

1085

(('uge', ('umax', a, b), a), True),

1086

(('ilt', a, ('imin', b, a)), False),

1087

(('ilt', ('imax', a, b), a), False),

1088

(('ige', a, ('imax', b, a)), ('ige', a, b)),

1089

(('ige', ('imin', a, b), a), ('ige', b, a)),

1090

(('ult', a, ('umin', b, a)), False),

1091

(('ult', ('umax', a, b), a), False),

1092

(('uge', a, ('umax', b, a)), ('uge', a, b)),

1093

(('uge', ('umin', a, b), a), ('uge', b, a)),

1094

(('ult', a, ('iand', b, a)), False),

1095

(('ult', ('ior', a, b), a), False),

1096

(('uge', a, ('iand', b, a)), True),

1097

(('uge', ('ior', a, b), a), True),

1098

1099

(('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))),

1100

(('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))),

1101

(('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))),

1102

(('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))),

1103

(('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))),

1104

(('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))),

1105

(('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))),

1106

(('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))),

1107

(('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))),

1108

(('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))),

1109

(('ige', '#a', ('imax', '#b', c)), ('iand', ('ige', a, b), ('ige', a, c))),

1110

(('ige', ('imin', '#a', b), '#c'), ('iand', ('ige', a, c), ('ige', b, c))),

1111

(('ult', '#a', ('umin', '#b', c)), ('iand', ('ult', a, b), ('ult', a, c))),

1112

(('ult', ('umax', '#a', b), '#c'), ('iand', ('ult', a, c), ('ult', b, c))),

1113

(('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))),

1114

(('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))),

1115

1116

# Thanks to sign extension, the ishr(a, b) is negative if and only if a is

1117

# negative.

1118

(('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)),

1119

('iabs', ('ishr', a, b))),

1120

(('iabs', ('ishr', ('iabs', a), b)), ('ishr', ('iabs', a), b)),

1121

1122

(('fabs', ('slt', a, b)), ('slt', a, b)),

1123

(('fabs', ('sge', a, b)), ('sge', a, b)),

1124

(('fabs', ('seq', a, b)), ('seq', a, b)),

1125

(('fabs', ('sne', a, b)), ('sne', a, b)),

1126

(('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'),

1127

(('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),

1128

(('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),

1129

(('sne', a, b), ('b2f', ('fneu', a, b)), 'options->lower_scmp'),

1130

(('seq', ('seq', a, b), 1.0), ('seq', a, b)),

1131

(('seq', ('sne', a, b), 1.0), ('sne', a, b)),

1132

(('seq', ('slt', a, b), 1.0), ('slt', a, b)),

1133

(('seq', ('sge', a, b), 1.0), ('sge', a, b)),

1134

(('sne', ('seq', a, b), 0.0), ('seq', a, b)),

1135

(('sne', ('sne', a, b), 0.0), ('sne', a, b)),

1136

(('sne', ('slt', a, b), 0.0), ('slt', a, b)),

1137

(('sne', ('sge', a, b), 0.0), ('sge', a, b)),

1138

(('seq', ('seq', a, b), 0.0), ('sne', a, b)),

1139

(('seq', ('sne', a, b), 0.0), ('seq', a, b)),

1140

(('seq', ('slt', a, b), 0.0), ('sge', a, b)),

1141

(('seq', ('sge', a, b), 0.0), ('slt', a, b)),

1142

(('sne', ('seq', a, b), 1.0), ('sne', a, b)),

1143

(('sne', ('sne', a, b), 1.0), ('seq', a, b)),

1144

(('sne', ('slt', a, b), 1.0), ('sge', a, b)),

1145

(('sne', ('sge', a, b), 1.0), ('slt', a, b)),

1146

(('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),

1147

(('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'),

1148

(('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'),

1149

(('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'),

1150

(('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),

1151

(('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),

1152

1153

(('ball_iequal2', a, b), ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),

1154

(('ball_iequal3', a, b), ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), ('ieq', 'a.z', 'b.z')), 'options->lower_vector_cmp'),

1155

(('ball_iequal4', a, b), ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), ('iand', ('ieq', 'a.z', 'b.z'), ('ieq', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),

1156

1157

(('bany_inequal2', a, b), ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), 'options->lower_vector_cmp'),

1158

(('bany_inequal3', a, b), ('ior', ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), ('ine', 'a.z', 'b.z')), 'options->lower_vector_cmp'),

1159

(('bany_inequal4', a, b), ('ior', ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), ('ior', ('ine', 'a.z', 'b.z'), ('ine', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),

1160

1161

(('ball_fequal2', a, b), ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),

1162

(('ball_fequal3', a, b), ('iand', ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), ('feq', 'a.z', 'b.z')), 'options->lower_vector_cmp'),

1163

(('ball_fequal4', a, b), ('iand', ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), ('iand', ('feq', 'a.z', 'b.z'), ('feq', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),

1164

1165

(('bany_fnequal2', a, b), ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), 'options->lower_vector_cmp'),

1166

(('bany_fnequal3', a, b), ('ior', ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), ('fneu', 'a.z', 'b.z')), 'options->lower_vector_cmp'),

1167

(('bany_fnequal4', a, b), ('ior', ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), ('ior', ('fneu', 'a.z', 'b.z'), ('fneu', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),

1168

1169

(('feq', ('seq', a, b), 1.0), ('feq', a, b)),

1170

(('feq', ('sne', a, b), 1.0), ('fneu', a, b)),

1171

(('feq', ('slt', a, b), 1.0), ('flt', a, b)),

1172

(('feq', ('sge', a, b), 1.0), ('fge', a, b)),

1173

(('fneu', ('seq', a, b), 0.0), ('feq', a, b)),

1174

(('fneu', ('sne', a, b), 0.0), ('fneu', a, b)),

1175

(('fneu', ('slt', a, b), 0.0), ('flt', a, b)),

1176

(('fneu', ('sge', a, b), 0.0), ('fge', a, b)),

1177

(('feq', ('seq', a, b), 0.0), ('fneu', a, b)),

1178

(('feq', ('sne', a, b), 0.0), ('feq', a, b)),

1179

(('feq', ('slt', a, b), 0.0), ('fge', a, b)),

1180

(('feq', ('sge', a, b), 0.0), ('flt', a, b)),

1181

(('fneu', ('seq', a, b), 1.0), ('fneu', a, b)),

1182

(('fneu', ('sne', a, b), 1.0), ('feq', a, b)),

1183

(('fneu', ('slt', a, b), 1.0), ('fge', a, b)),

1184

(('fneu', ('sge', a, b), 1.0), ('flt', a, b)),

1185

1186

(('fneu', ('fneg', a), a), ('fneu', a, 0.0)),

1187

(('feq', ('fneg', a), a), ('feq', a, 0.0)),

1188

# Emulating booleans

1189

(('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),

1190

(('iand', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),

1191

(('ior', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))),

1192

(('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),

1193

(('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))),

1194

(('iand', 'a@bool16', 1.0), ('b2f', a)),

1195

(('iand', 'a@bool32', 1.0), ('b2f', a)),

1196

(('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF.

1197

# Comparison with the same args. Note that these are only done for the

1198

# float versions when the source must be a number. Generally, NaN cmp NaN

1199

# produces the opposite result of X cmp X. flt is the outlier. NaN < NaN

1200

# is false, and, for any number X, X < X is also false.

1201

(('ilt', a, a), False),

1202

(('ige', a, a), True),

1203

(('ieq', a, a), True),

1204

(('ine', a, a), False),

1205

(('ult', a, a), False),

1206

(('uge', a, a), True),

1207

(('flt', a, a), False),

1208

(('fge', 'a(is_a_number)', a), True),

1209

(('feq', 'a(is_a_number)', a), True),

1210

(('fneu', 'a(is_a_number)', a), False),

1211

# Logical and bit operations

1212

(('iand', a, a), a),

1213

(('iand', a, ~0), a),

1214

(('iand', a, 0), 0),

1215

(('ior', a, a), a),

1216

(('ior', a, 0), a),

1217

(('ior', a, True), True),

1218

(('ixor', a, a), 0),

1219

(('ixor', a, 0), a),

1220

(('inot', ('inot', a)), a),

1221

(('ior', ('iand', a, b), b), b),

1222

(('ior', ('ior', a, b), b), ('ior', a, b)),

1223

(('iand', ('ior', a, b), b), b),

1224

(('iand', ('iand', a, b), b), ('iand', a, b)),

1225

# DeMorgan's Laws

1226

(('iand', ('inot', a), ('inot', b)), ('inot', ('ior', a, b))),

1227

(('ior', ('inot', a), ('inot', b)), ('inot', ('iand', a, b))),

1228

# Shift optimizations

1229

(('ishl', 0, a), 0),

1230

(('ishl', a, 0), a),

1231

(('ishr', 0, a), 0),

1232

(('ishr', a, 0), a),

1233

(('ushr', 0, a), 0),

1234

(('ushr', a, 0), a),

1235

(('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),

1236

(('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), '!options->lower_rotate'),

1237

(('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),

1238

(('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), '!options->lower_rotate'),

1239

(('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'),

1240

(('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), '!options->lower_rotate'),

1241

(('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'),

1242

(('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), '!options->lower_rotate'),

1243

(('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), 'options->lower_rotate'),

1244

(('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), 'options->lower_rotate'),

1245

(('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), 'options->lower_rotate'),

1246

(('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), 'options->lower_rotate'),

1247

# Exponential/logarithmic identities

1248

(('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a

1249

(('~flog2', ('fexp2', a)), a), # lg2(2^a) = a

1250

(('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b)

1251

(('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b

1252

(('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),

1253

('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) + d) = a^b * c^d

1254

(('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)),

1255

(('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)),

1256

(('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))),

1257

(('~fpow', a, 1.0), a),

1258

(('~fpow', a, 2.0), ('fmul', a, a)),

1259

(('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),

1260

(('~fpow', 2.0, a), ('fexp2', a)),

1261

(('~fpow', ('fpow', a, 2.2), 0.454545), a),

1262

(('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),

1263

(('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),

1264

(('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),

1265

(('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),

1266

(('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),

1267

(('~flog2', ('frcp', a)), ('fneg', ('flog2', a))),

1268

(('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),

1269

(('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),

1270

(('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))),

1271

(('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))),

1272

(('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)),

1273

(('~fmulz', ('fsqrt', a), ('fsqrt', a)), ('fabs', a)),

1274

# Division and reciprocal

1275

(('~fdiv', 1.0, a), ('frcp', a)),

1276

(('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),

1277

(('~frcp', ('frcp', a)), a),

1278

(('~frcp', ('fsqrt', a)), ('frsq', a)),

1279

(('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),

1280

(('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),

1281

# Trig

1282

(('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'),

1283

(('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'),

1284

# Boolean simplifications

1285

(('i2b16(is_used_by_if)', a), ('ine16', a, 0)),

1286

(('i2b32(is_used_by_if)', a), ('ine32', a, 0)),

1287

(('i2b1(is_used_by_if)', a), ('ine', a, 0)),

1288

(('ieq', a, True), a),

1289

(('ine(is_not_used_by_if)', a, True), ('inot', a)),

1290

(('ine', a, False), a),

1291

(('ieq(is_not_used_by_if)', a, False), ('inot', 'a')),

1292

(('bcsel', a, True, False), a),

1293

(('bcsel', a, False, True), ('inot', a)),

1294

(('bcsel', True, b, c), b),

1295

(('bcsel', False, b, c), c),

1296

1297

(('bcsel@16', a, 1.0, 0.0), ('b2f', a)),

1298

(('bcsel@16', a, 0.0, 1.0), ('b2f', ('inot', a))),

1299

(('bcsel@16', a, -1.0, -0.0), ('fneg', ('b2f', a))),

1300

(('bcsel@16', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),

1301

(('bcsel@32', a, 1.0, 0.0), ('b2f', a)),

1302

(('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))),

1303

(('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))),

1304

(('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),

1305

(('bcsel@64', a, 1.0, 0.0), ('b2f', a), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),

1306

(('bcsel@64', a, 0.0, 1.0), ('b2f', ('inot', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),

1307

(('bcsel@64', a, -1.0, -0.0), ('fneg', ('b2f', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),

1308

(('bcsel@64', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a))), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),

1309

1310

(('bcsel', a, b, b), b),

1311

(('~fcsel', a, b, b), b),

1312

1313

# D3D Boolean emulation

1314

(('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))),

1315

(('bcsel', a, 0, -1), ('ineg', ('b2i', ('inot', a)))),

1316

(('bcsel', a, 1, 0), ('b2i', 'a@1')),

1317

(('bcsel', a, 0, 1), ('b2i', ('inot', a))),

1318

(('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),

1319

('ineg', ('b2i', ('iand', a, b)))),

1320

(('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))),

1321

('ineg', ('b2i', ('ior', a, b)))),

1322

(('ieq', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)),

1323

(('ieq', ('ineg', ('b2i', 'a@1')), -1), a),

1324

(('ine', ('ineg', ('b2i', 'a@1')), 0), a),

1325

(('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)),

1326

(('ige', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)),

1327

(('ilt', ('ineg', ('b2i', 'a@1')), 0), a),

1328

(('ult', 0, ('ineg', ('b2i', 'a@1'))), a),

1329

(('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)),

1330

(('iand', ('ineg', ('b2i', a)), 1), ('b2i', a)),

1331

1332

# With D3D booleans, imax is AND and umax is OR

1333

(('imax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),

1334

('ineg', ('b2i', ('iand', a, b)))),

1335

(('imin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),

1336

('ineg', ('b2i', ('ior', a, b)))),

1337

(('umax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),

1338

('ineg', ('b2i', ('ior', a, b)))),

1339

(('umin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),

1340

('ineg', ('b2i', ('iand', a, b)))),

1341

1342

# Conversions

1343

(('i2b16', ('b2i', 'a@16')), a),

1344

(('i2b32', ('b2i', 'a@32')), a),

1345

(('f2i', ('ftrunc', a)), ('f2i', a)),

1346

(('f2u', ('ftrunc', a)), ('f2u', a)),

1347

(('i2b', ('ineg', a)), ('i2b', a)),

1348

(('i2b', ('iabs', a)), ('i2b', a)),

1349

(('inot', ('f2b1', a)), ('feq', a, 0.0)),

1350

1351

# Conversions from 16 bits to 32 bits and back can always be removed

1352

(('f2fmp', ('f2f32', 'a@16')), a),

1353

(('i2imp', ('i2i32', 'a@16')), a),

1354

(('i2imp', ('u2u32', 'a@16')), a),

1355

1356

(('f2imp', ('f2f32', 'a@16')), ('f2i16', a)),

1357

(('f2ump', ('f2f32', 'a@16')), ('f2u16', a)),

1358

(('i2fmp', ('i2i32', 'a@16')), ('i2f16', a)),

1359

(('u2fmp', ('u2u32', 'a@16')), ('u2f16', a)),

1360

1361

(('f2fmp', ('b2f32', 'a@1')), ('b2f16', a)),

1362

(('i2imp', ('b2i32', 'a@1')), ('b2i16', a)),

1363

(('i2imp', ('b2i32', 'a@1')), ('b2i16', a)),

1364

1365

(('f2imp', ('b2f32', 'a@1')), ('b2i16', a)),

1366

(('f2ump', ('b2f32', 'a@1')), ('b2i16', a)),

1367

(('i2fmp', ('b2i32', 'a@1')), ('b2f16', a)),

1368

(('u2fmp', ('b2i32', 'a@1')), ('b2f16', a)),

1369

1370

# Conversions to 16 bits would be lossy so they should only be removed if

1371

# the instruction was generated by the precision lowering pass.

1372

(('f2f32', ('f2fmp', 'a@32')), a),

1373

(('i2i32', ('i2imp', 'a@32')), a),

1374

(('u2u32', ('i2imp', 'a@32')), a),

1375

1376

(('i2i32', ('f2imp', 'a@32')), ('f2i32', a)),

1377

(('u2u32', ('f2ump', 'a@32')), ('f2u32', a)),

1378

(('f2f32', ('i2fmp', 'a@32')), ('i2f32', a)),

1379

(('f2f32', ('u2fmp', 'a@32')), ('u2f32', a)),

1380

1381

# Conversions from float32 to float64 and back can be removed as long as

1382

# it doesn't need to be precise, since the conversion may e.g. flush denorms

1383

(('~f2f32', ('f2f64', 'a@32')), a),

1384

1385

(('ffloor', 'a(is_integral)'), a),

1386

(('fceil', 'a(is_integral)'), a),

1387

(('ftrunc', 'a(is_integral)'), a),

1388

# fract(x) = x - floor(x), so fract(NaN) = NaN

1389

(('~ffract', 'a(is_integral)'), 0.0),

1390

(('fabs', 'a(is_not_negative)'), a),

1391

(('iabs', 'a(is_not_negative)'), a),

1392

(('fsat', 'a(is_not_positive)'), 0.0),

1393

1394

(('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'),

1395

1396

# The result of the multiply must be in [-1, 0], so the result of the ffma

1397

# must be in [0, 1].

1398

(('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False),

1399

(('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False),

1400

(('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)),

1401

(('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)),

1402

1403

(('fneu', 'a(is_not_zero)', 0.0), True),

1404

(('feq', 'a(is_not_zero)', 0.0), False),

1405

1406

# In this chart, + means value > 0 and - means value < 0.

1407

1408

# + >= + -> unknown 0 >= + -> false - >= + -> false

1409

# + >= 0 -> true 0 >= 0 -> true - >= 0 -> false

1410

# + >= - -> true 0 >= - -> true - >= - -> unknown

1411

1412

# Using grouping conceptually similar to a Karnaugh map...

1413

1414

# (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true

1415

# (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false

1416

# (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false

1417

1418

# The flt / ilt cases just invert the expected result.

1419

1420

# The results expecting true, must be marked imprecise. The results

1421

# expecting false are fine because NaN compared >= or < anything is false.

1422

1423

(('fge', 'a(is_a_number_not_negative)', 'b(is_a_number_not_positive)'), True),

1424

(('fge', 'a(is_not_positive)', 'b(is_gt_zero)'), False),

1425

(('fge', 'a(is_lt_zero)', 'b(is_not_negative)'), False),

1426

1427

(('flt', 'a(is_not_negative)', 'b(is_not_positive)'), False),

1428

(('flt', 'a(is_a_number_not_positive)', 'b(is_a_number_gt_zero)'), True),

1429

(('flt', 'a(is_a_number_lt_zero)', 'b(is_a_number_not_negative)'), True),

1430

1431

(('ine', 'a(is_not_zero)', 0), True),

1432

(('ieq', 'a(is_not_zero)', 0), False),

1433

1434

(('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True),

1435

(('ige', 'a(is_not_positive)', 'b(is_gt_zero)'), False),

1436

(('ige', 'a(is_lt_zero)', 'b(is_not_negative)'), False),

1437

1438

(('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False),

1439

(('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'), True),

1440

(('ilt', 'a(is_lt_zero)', 'b(is_not_negative)'), True),

1441

1442

(('ult', 0, 'a(is_gt_zero)'), True),

1443

(('ult', a, 0), False),

1444

1445

# Packing and then unpacking does nothing

1446

(('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),

1447

(('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b),

1448

(('unpack_64_2x32', ('pack_64_2x32_split', a, b)), ('vec2', a, b)),

1449

(('unpack_64_2x32', ('pack_64_2x32', a)), a),

1450

(('unpack_double_2x32_dxil', ('pack_double_2x32_dxil', a)), a),

1451

(('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),

1452

('unpack_64_2x32_split_y', a)), a),

1453

(('pack_64_2x32', ('vec2', ('unpack_64_2x32_split_x', a),

1454

('unpack_64_2x32_split_y', a))), a),

1455

(('pack_64_2x32', ('unpack_64_2x32', a)), a),

1456

(('pack_double_2x32_dxil', ('unpack_double_2x32_dxil', a)), a),

1457

1458

# Comparing two halves of an unpack separately. While this optimization

1459

# should be correct for non-constant values, it's less obvious that it's

1460

# useful in that case. For constant values, the pack will fold and we're

1461

# guaranteed to reduce the whole tree to one instruction.

1462

(('iand', ('ieq', ('unpack_32_2x16_split_x', a), '#b'),

1463

('ieq', ('unpack_32_2x16_split_y', a), '#c')),

1464

('ieq', a, ('pack_32_2x16_split', b, c))),

1465

1466

# Byte extraction

1467

(('ushr', 'a@16', 8), ('extract_u8', a, 1), '!options->lower_extract_byte'),

1468

(('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),

1469

(('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'),

1470

(('ishr', 'a@16', 8), ('extract_i8', a, 1), '!options->lower_extract_byte'),

1471

(('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),

1472

(('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'),

1473

(('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),

1474

1475

# Common pattern in many Vulkan CTS tests that read 8-bit integers from a

1476

# storage buffer.

1477

(('u2u8', ('extract_u16', a, 1)), ('u2u8', ('extract_u8', a, 2)), '!options->lower_extract_byte'),

1478

(('u2u8', ('ushr', a, 8)), ('u2u8', ('extract_u8', a, 1)), '!options->lower_extract_byte'),

1479

1480

# Common pattern after lowering 8-bit integers to 16-bit.

1481

(('i2i16', ('u2u8', ('extract_u8', a, b))), ('i2i16', ('extract_i8', a, b))),

1482

(('u2u16', ('u2u8', ('extract_u8', a, b))), ('u2u16', ('extract_u8', a, b))),

1483

1484

(('ubfe', a, 0, 8), ('extract_u8', a, 0), '!options->lower_extract_byte'),

1485

(('ubfe', a, 8, 8), ('extract_u8', a, 1), '!options->lower_extract_byte'),

1486

(('ubfe', a, 16, 8), ('extract_u8', a, 2), '!options->lower_extract_byte'),

1487

(('ubfe', a, 24, 8), ('extract_u8', a, 3), '!options->lower_extract_byte'),

1488

(('ibfe', a, 0, 8), ('extract_i8', a, 0), '!options->lower_extract_byte'),

1489

(('ibfe', a, 8, 8), ('extract_i8', a, 1), '!options->lower_extract_byte'),

1490

(('ibfe', a, 16, 8), ('extract_i8', a, 2), '!options->lower_extract_byte'),

1491

(('ibfe', a, 24, 8), ('extract_i8', a, 3), '!options->lower_extract_byte'),

1492

1493

(('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)),

1494

(('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)),

1495

1496

# Word extraction

1497

(('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'),

1498

(('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'),

1499

(('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'),

1500

(('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'),

1501

(('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),

1502

1503

(('ubfe', a, 0, 16), ('extract_u16', a, 0), '!options->lower_extract_word'),

1504

(('ubfe', a, 16, 16), ('extract_u16', a, 1), '!options->lower_extract_word'),

1505

(('ibfe', a, 0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'),

1506

(('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'),

1507

1508

# Packing a u8vec4 to write to an SSBO.

1509

(('ior', ('ishl', ('u2u32', 'a@8'), 24), ('ior', ('ishl', ('u2u32', 'b@8'), 16), ('ior', ('ishl', ('u2u32', 'c@8'), 8), ('u2u32', 'd@8')))),

1510

('pack_32_4x8', ('vec4', d, c, b, a)), 'options->has_pack_32_4x8'),

1511

1512

(('extract_u16', ('extract_i16', a, b), 0), ('extract_u16', a, b)),

1513

(('extract_u16', ('extract_u16', a, b), 0), ('extract_u16', a, b)),

1514

1515

# Lower pack/unpack

1516

(('pack_64_2x32_split', a, b), ('ior', ('u2u64', a), ('ishl', ('u2u64', b), 32)), 'options->lower_pack_64_2x32_split'),

1517

(('pack_32_2x16_split', a, b), ('ior', ('u2u32', a), ('ishl', ('u2u32', b), 16)), 'options->lower_pack_32_2x16_split'),

1518

(('unpack_64_2x32_split_x', a), ('u2u32', a), 'options->lower_unpack_64_2x32_split'),

1519

(('unpack_64_2x32_split_y', a), ('u2u32', ('ushr', a, 32)), 'options->lower_unpack_64_2x32_split'),

1520

(('unpack_32_2x16_split_x', a), ('u2u16', a), 'options->lower_unpack_32_2x16_split'),

1521

(('unpack_32_2x16_split_y', a), ('u2u16', ('ushr', a, 16)), 'options->lower_unpack_32_2x16_split'),

1522

1523

# Useless masking before unpacking

1524

(('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)),

1525

(('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)),

1526

(('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)),

1527

(('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)),

1528

(('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)),

1529

(('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)),

1530

1531

(('unpack_half_2x16_split_x', ('extract_u16', a, 0)), ('unpack_half_2x16_split_x', a)),

1532

(('unpack_half_2x16_split_x', ('extract_u16', a, 1)), ('unpack_half_2x16_split_y', a)),

1533

(('unpack_half_2x16_split_x', ('ushr', a, 16)), ('unpack_half_2x16_split_y', a)),

1534

(('unpack_32_2x16_split_x', ('extract_u16', a, 0)), ('unpack_32_2x16_split_x', a)),

1535

(('unpack_32_2x16_split_x', ('extract_u16', a, 1)), ('unpack_32_2x16_split_y', a)),

1536

1537

# Optimize half packing

1538

(('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))),

1539

(('ushr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))),

1540

1541

(('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),

1542

('pack_half_2x16', ('vec2', a, b))),

1543

(('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),

1544

('pack_half_2x16', ('vec2', a, b))),

1545

1546

(('ishl', ('pack_half_2x16_split', a, 0), 16), ('pack_half_2x16_split', 0, a)),

1547

(('ushr', ('pack_half_2x16_split', 0, a), 16), ('pack_half_2x16_split', a, 0)),

1548

(('extract_u16', ('pack_half_2x16_split', 0, a), 1), ('pack_half_2x16_split', a, 0)),

1549

1550

(('iadd', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),

1551

(('ior', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),

1552

1553

(('extract_i8', ('pack_32_4x8_split', a, b, c, d), 0), ('i2i', a)),

1554

(('extract_i8', ('pack_32_4x8_split', a, b, c, d), 1), ('i2i', b)),

1555

(('extract_i8', ('pack_32_4x8_split', a, b, c, d), 2), ('i2i', c)),

1556

(('extract_i8', ('pack_32_4x8_split', a, b, c, d), 3), ('i2i', d)),

1557

(('extract_u8', ('pack_32_4x8_split', a, b, c, d), 0), ('u2u', a)),

1558

(('extract_u8', ('pack_32_4x8_split', a, b, c, d), 1), ('u2u', b)),

1559

(('extract_u8', ('pack_32_4x8_split', a, b, c, d), 2), ('u2u', c)),

1560

(('extract_u8', ('pack_32_4x8_split', a, b, c, d), 3), ('u2u', d)),

1561

])

1562

1563

# After the ('extract_u8', a, 0) pattern, above, triggers, there will be

1564

# patterns like those below.

1565

for op in ('ushr', 'ishr'):

1566

optimizations.extend([(('extract_u8', (op, 'a@16', 8), 0), ('extract_u8', a, 1))])

1567

optimizations.extend([(('extract_u8', (op, 'a@32', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)])

1568

optimizations.extend([(('extract_u8', (op, 'a@64', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)])

1569

1570

optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))])

1571

1572

# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be

1573

# patterns like those below.

1574

for op in ('extract_u8', 'extract_i8'):

1575

optimizations.extend([((op, ('ishl', 'a@16', 8), 1), (op, a, 0))])

1576

optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)])

1577

optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)])

1578

1579

optimizations.extend([

1580

# Subtracts

1581

(('ussub_4x8_vc4', a, 0), a),

1582

(('ussub_4x8_vc4', a, ~0), 0),

1583

# Lower all Subtractions first - they can get recombined later

1584

(('fsub', a, b), ('fadd', a, ('fneg', b))),

1585

(('isub', a, b), ('iadd', a, ('ineg', b))),

1586

(('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),

1587

# This is correct. We don't need isub_sat because the result type is unsigned, so it cannot overflow.

1588

(('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),

1589

1590

# Propagate negation up multiplication chains

1591

(('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))),

1592

(('fmulz(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmulz', a, b)), '!'+signed_zero_inf_nan_preserve_32),

1593

(('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),

1594

(('ffmaz', ('fneg', a), ('fneg', b), c), ('ffmaz', a, b, c)),

1595

(('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),

1596

1597

# Propagate constants up multiplication chains

1598

(('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)),

1599

(('~fmulz(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmulz', ('fmulz', a, c), b)),

1600

(('~fmul(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)'), ('fmulz', ('fmul', a, c), b)),

1601

(('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)),

1602

(('~ffma', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffma', ('fmul', a, c), b, d)),

1603

(('~ffmaz', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffmaz', ('fmulz', a, c), b, d)),

1604

(('~ffma', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)', d), ('ffmaz', ('fmul', a, c), b, d)),

1605

# Prefer moving out a multiplication for more MAD/FMA-friendly code

1606

(('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_fmul)'), '#c'), ('fadd', ('fadd', a, c), b)),

1607

(('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)),

1608

(('~fadd(is_used_once)', ('ffma(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffma', a, b, d), c)),

1609

(('~fadd(is_used_once)', ('ffmaz(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffmaz', a, b, d), c)),

1610

(('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)),

1611

1612

# Reassociate constants in add/mul chains so they can be folded together.

1613

# For now, we mostly only handle cases where the constants are separated by

1614

# a single non-constant. We could do better eventually.

1615

(('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)),

1616

(('~fmulz', '#a', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmulz', a, c), b)),

1617

(('~fmul', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmul', a, c), b)),

1618

(('~ffma', '#a', ('fmul', 'b(is_not_const)', '#c'), d), ('ffma', ('fmul', a, c), b, d)),

1619

(('~ffmaz', '#a', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmulz', a, c), b, d)),

1620

(('~ffmaz', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmul', a, c), b, d)),

1621

(('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)),

1622

(('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)),

1623

(('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),

1624

(('~fadd', '#a', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffma', b, c, ('fadd', a, d))),

1625

(('~fadd', '#a', ('fneg', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffma', ('fneg', b), c, ('fadd', a, ('fneg', d)))),

1626

(('~fadd', '#a', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffmaz', b, c, ('fadd', a, d))),

1627

(('~fadd', '#a', ('fneg', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffmaz', ('fneg', b), c, ('fadd', a, ('fneg', d)))),

1628

(('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)),

1629

(('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)),

1630

(('ior', '#a', ('ior', 'b(is_not_const)', '#c')), ('ior', ('ior', a, c), b)),

1631

(('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)),

1632

1633

# Reassociate add chains for more MAD/FMA-friendly code

1634

(('~fadd', ('fadd(is_used_once)', 'a(is_fmul)', 'b(is_fmul)'), 'c(is_not_fmul)'), ('fadd', ('fadd', a, c), b)),

1635

1636

# Drop mul-div by the same value when there's no wrapping.

1637

(('idiv', ('imul(no_signed_wrap)', a, b), b), a),

1638

1639

# By definition...

1640

(('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)),

1641

(('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1), ('ifind_msb', a)),

1642

(('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1), ('ufind_msb', a)),

1643

1644

(('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)),

1645

(('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)),

1646

(('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)),

1647

1648

(('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),

1649

1650

(('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),

1651

(('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),

1652

(('~fmulz', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),

1653

(('~fmulz', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),

1654

(('~bcsel', ('flt', a, 0.0), ('fneg', a), a), ('fabs', a)),

1655

1656

(('bcsel', a, ('bcsel', b, c, d), d), ('bcsel', ('iand', a, b), c, d)),

1657

(('bcsel', a, b, ('bcsel', c, b, d)), ('bcsel', ('ior', a, c), b, d)),

1658

1659

# Misc. lowering

1660

(('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),

1661

(('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),

1662

(('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),

1663

(('usub_borrow', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),

1664

1665

(('bitfield_insert', 'base', 'insert', 'offset', 'bits'),

1666

('bcsel', ('ult', 31, 'bits'), 'insert',

1667

('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')),

1668

'options->lower_bitfield_insert'),

1669

(('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),

1670

(('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),

1671

(('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),

1672

(('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),

1673

(('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),

1674

(('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),

1675

(('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),

1676

(('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),

1677

1678

(('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat || (options->lower_int64_options & nir_lower_iadd64) != 0'),

1679

(('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat'),

1680

(('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_uadd_sat'),

1681

(('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), '(options->lower_int64_options & nir_lower_usub_sat64) != 0'),

1682

1683

# int64_t sum = a + b;

1684

1685

# if (a < 0 && b < 0 && a < sum)

1686

# sum = INT64_MIN;

1687

# } else if (a >= 0 && b >= 0 && sum < a)

1688

# sum = INT64_MAX;

1689

# }

1690

1691

# A couple optimizations are applied.

1692

1693

# 1. a < sum => sum >= 0. This replacement works because it is known that

1694

# a < 0 and b < 0, so sum should also be < 0 unless there was

1695

# underflow.

1696

1697

# 2. sum < a => sum < 0. This replacement works because it is known that

1698

# a >= 0 and b >= 0, so sum should also be >= 0 unless there was

1699

# overflow.

1700

1701

# 3. Invert the second if-condition and swap the order of parameters for

1702

# the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >=

1703

# 0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0)

1704

1705

# On Intel Gen11, this saves ~11 instructions.

1706

(('iadd_sat@64', a, b), ('bcsel',

1707

('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),

1708

0x8000000000000000,

1709

('bcsel',

1710

('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),

1711

('iadd', a, b),

1712

0x7fffffffffffffff)),

1713

'(options->lower_int64_options & nir_lower_iadd_sat64) != 0'),

1714

1715

# int64_t sum = a - b;

1716

1717

# if (a < 0 && b >= 0 && a < sum)

1718

# sum = INT64_MIN;

1719

# } else if (a >= 0 && b < 0 && a >= sum)

1720

# sum = INT64_MAX;

1721

# }

1722

1723

# Optimizations similar to the iadd_sat case are applied here.

1724

(('isub_sat@64', a, b), ('bcsel',

1725

('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),

1726

0x8000000000000000,

1727

('bcsel',

1728

('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),

1729

('isub', a, b),

1730

0x7fffffffffffffff)),

1731

'(options->lower_int64_options & nir_lower_iadd_sat64) != 0'),

1732

1733

# These are done here instead of in the backend because the int64 lowering

1734

# pass will make a mess of the patterns. The first patterns are

1735

# conditioned on nir_lower_minmax64 because it was not clear that it was

1736

# always an improvement on platforms that have real int64 support. No

1737

# shaders in shader-db hit this, so it was hard to say one way or the

1738

# other.

1739

(('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),

1740

(('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),

1741

(('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),

1742

(('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),

1743

(('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),

1744

(('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),

1745

1746

(('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),

1747

(('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),

1748

# 0u < uint(a) <=> uint(a) != 0u

1749

(('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),

1750

1751

# Alternative lowering that doesn't rely on bfi.

1752

(('bitfield_insert', 'base', 'insert', 'offset', 'bits'),

1753

('bcsel', ('ult', 31, 'bits'),

1754

'insert',

1755

(('ior',

1756

('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))),

1757

('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))))),

1758

'options->lower_bitfield_insert_to_shifts'),

1759

1760

# Alternative lowering that uses bitfield_select.

1761

(('bitfield_insert', 'base', 'insert', 'offset', 'bits'),

1762

('bcsel', ('ult', 31, 'bits'), 'insert',

1763

('bitfield_select', ('bfm', 'bits', 'offset'), ('ishl', 'insert', 'offset'), 'base')),

1764

'options->lower_bitfield_insert_to_bitfield_select'),

1765

1766

(('ibitfield_extract', 'value', 'offset', 'bits'),

1767

('bcsel', ('ult', 31, 'bits'), 'value',

1768

('ibfe', 'value', 'offset', 'bits')),

1769

'options->lower_bitfield_extract'),

1770

1771

(('ubitfield_extract', 'value', 'offset', 'bits'),

1772

('bcsel', ('ult', 31, 'bits'), 'value',

1773

('ubfe', 'value', 'offset', 'bits')),

1774

'options->lower_bitfield_extract'),

1775

1776

# (src0 & src1) | (~src0 & src2). Constant fold if src2 is 0.

1777

(('bitfield_select', a, b, 0), ('iand', a, b)),

1778

(('bitfield_select', a, ('iand', a, b), c), ('bitfield_select', a, b, c)),

1779

1780

# Note that these opcodes are defined to only use the five least significant bits of 'offset' and 'bits'

1781

(('ubfe', 'value', 'offset', ('iand', 31, 'bits')), ('ubfe', 'value', 'offset', 'bits')),

1782

(('ubfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ubfe', 'value', 'offset', 'bits')),

1783

(('ibfe', 'value', 'offset', ('iand', 31, 'bits')), ('ibfe', 'value', 'offset', 'bits')),

1784

(('ibfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ibfe', 'value', 'offset', 'bits')),

1785

(('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')),

1786

(('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')),

1787

1788

# Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:

1789

1790

# If bits is zero, the result will be zero.

1791

1792

# These patterns prevent other patterns from generating invalid results

1793

# when count is zero.

1794

(('ubfe', a, b, 0), 0),

1795

(('ibfe', a, b, 0), 0),

1796

1797

(('ubfe', a, 0, '#b'), ('iand', a, ('ushr', 0xffffffff, ('ineg', b)))),

1798

1799

(('b2i32', ('i2b', ('ubfe', a, b, 1))), ('ubfe', a, b, 1)),

1800

(('b2i32', ('i2b', ('ibfe', a, b, 1))), ('ubfe', a, b, 1)), # ubfe in the replacement is correct

1801

(('ine', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),

1802

(('ieq', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),

1803

(('ine', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),

1804

(('ieq', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),

1805

1806

(('ibitfield_extract', 'value', 'offset', 'bits'),

1807

('bcsel', ('ieq', 0, 'bits'),

1808

1809

('ishr',

1810

('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')),

1811

('isub', 32, 'bits'))),

1812

'options->lower_bitfield_extract_to_shifts'),

1813

1814

(('ubitfield_extract', 'value', 'offset', 'bits'),

1815

('iand',

1816

('ushr', 'value', 'offset'),

1817

('bcsel', ('ieq', 'bits', 32),

1818

0xffffffff,

1819

('isub', ('ishl', 1, 'bits'), 1))),

1820

'options->lower_bitfield_extract_to_shifts'),

1821

1822

(('ifind_msb', 'value'),

1823

('ufind_msb', ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')),

1824

'options->lower_ifind_msb'),

1825

1826

(('ifind_msb', 'value'),

1827

('bcsel', ('ige', ('ifind_msb_rev', 'value'), 0),

1828

('isub', 31, ('ifind_msb_rev', 'value')),

1829

('ifind_msb_rev', 'value')),

1830

'options->lower_find_msb_to_reverse'),

1831

1832

(('ufind_msb', 'value'),

1833

('bcsel', ('ige', ('ufind_msb_rev', 'value'), 0),

1834

('isub', 31, ('ufind_msb_rev', 'value')),

1835

('ufind_msb_rev', 'value')),

1836

'options->lower_find_msb_to_reverse'),

1837

1838

(('find_lsb', 'value'),

1839

('ufind_msb', ('iand', 'value', ('ineg', 'value'))),

1840

'options->lower_find_lsb'),

1841

1842

(('extract_i8', a, 'b@32'),

1843

('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24),

1844

'options->lower_extract_byte'),

1845

1846

(('extract_u8', a, 'b@32'),

1847

('iand', ('ushr', a, ('imul', b, 8)), 0xff),

1848

'options->lower_extract_byte'),

1849

1850

(('extract_i16', a, 'b@32'),

1851

('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16),

1852

'options->lower_extract_word'),

1853

1854

(('extract_u16', a, 'b@32'),

1855

('iand', ('ushr', a, ('imul', b, 16)), 0xffff),

1856

'options->lower_extract_word'),

1857

1858

(('pack_unorm_2x16', 'v'),

1859

('pack_uvec2_to_uint',

1860

('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))),

1861

'options->lower_pack_unorm_2x16'),

1862

1863

(('pack_unorm_4x8', 'v'),

1864

('pack_uvec4_to_uint',

1865

('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))),

1866

'options->lower_pack_unorm_4x8'),

1867

1868

(('pack_snorm_2x16', 'v'),

1869

('pack_uvec2_to_uint',

1870

('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))),

1871

'options->lower_pack_snorm_2x16'),

1872

1873

(('pack_snorm_4x8', 'v'),

1874

('pack_uvec4_to_uint',

1875

('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))),

1876

'options->lower_pack_snorm_4x8'),

1877

1878

(('unpack_unorm_2x16', 'v'),

1879

('fdiv', ('u2f32', ('vec2', ('extract_u16', 'v', 0),

1880

('extract_u16', 'v', 1))),

1881

65535.0),

1882

'options->lower_unpack_unorm_2x16'),

1883

1884

(('unpack_unorm_4x8', 'v'),

1885

('fdiv', ('u2f32', ('vec4', ('extract_u8', 'v', 0),

1886

('extract_u8', 'v', 1),

1887

('extract_u8', 'v', 2),

1888

('extract_u8', 'v', 3))),

1889

255.0),

1890

'options->lower_unpack_unorm_4x8'),

1891

1892

(('unpack_snorm_2x16', 'v'),

1893

('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0),

1894

('extract_i16', 'v', 1))),

1895

32767.0))),

1896

'options->lower_unpack_snorm_2x16'),

1897

1898

(('unpack_snorm_4x8', 'v'),

1899

('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0),

1900

('extract_i8', 'v', 1),

1901

('extract_i8', 'v', 2),

1902

('extract_i8', 'v', 3))),

1903

127.0))),

1904

'options->lower_unpack_snorm_4x8'),

1905

1906

(('pack_half_2x16_split', 'a@32', 'b@32'),

1907

('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))),

1908

'options->lower_pack_split'),

1909

1910

(('unpack_half_2x16_split_x', 'a@32'),

1911

('f2f32', ('u2u16', a)),

1912

'options->lower_pack_split'),

1913

1914

(('unpack_half_2x16_split_y', 'a@32'),

1915

('f2f32', ('u2u16', ('ushr', a, 16))),

1916

'options->lower_pack_split'),

1917

1918

(('pack_32_2x16_split', 'a@16', 'b@16'),

1919

('ior', ('ishl', ('u2u32', b), 16), ('u2u32', a)),

1920

'options->lower_pack_split'),

1921

1922

(('unpack_32_2x16_split_x', 'a@32'),

1923

('u2u16', a),

1924

'options->lower_pack_split'),

1925

1926

(('unpack_32_2x16_split_y', 'a@32'),

1927

('u2u16', ('ushr', 'a', 16)),

1928

'options->lower_pack_split'),

1929

1930

(('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),

1931

(('imin', ('imax', a, -1), 1), ('isign', a), '!options->lower_isign'),

1932

(('imax', ('imin', a, 1), -1), ('isign', a), '!options->lower_isign'),

1933

# float(0 < NaN) - float(NaN < 0) = float(False) - float(False) = 0 - 0 = 0

1934

# Mark the new comparisons precise to prevent them being changed to 'a !=

1935

# 0' or 'a == 0'.

1936

(('fsign', a), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_fsign'),

1937

1938

# Address/offset calculations:

1939

# Drivers supporting imul24 should use the nir_lower_amul() pass, this

1940

# rule converts everyone else to imul:

1941

(('amul', a, b), ('imul', a, b), '!options->has_imul24'),

1942

1943

(('umul24', a, b),

1944

('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)),

1945

'!options->has_umul24'),

1946

(('umad24', a, b, c),

1947

('iadd', ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), c),

1948

'!options->has_umad24'),

1949

1950

# Relaxed 24bit ops

1951

(('imul24_relaxed', a, b), ('imul24', a, b), 'options->has_imul24'),

1952

(('imul24_relaxed', a, b), ('imul', a, b), '!options->has_imul24'),

1953

(('umad24_relaxed', a, b, c), ('umad24', a, b, c), 'options->has_umad24'),

1954

(('umad24_relaxed', a, b, c), ('iadd', ('umul24_relaxed', a, b), c), '!options->has_umad24'),

1955

(('umul24_relaxed', a, b), ('umul24', a, b), 'options->has_umul24'),

1956

(('umul24_relaxed', a, b), ('imul', a, b), '!options->has_umul24'),

1957

1958

(('imad24_ir3', a, b, 0), ('imul24', a, b)),

1959

(('imad24_ir3', a, 0, c), (c)),

1960

(('imad24_ir3', a, 1, c), ('iadd', a, c)),

1961

1962

# if first two srcs are const, crack apart the imad so constant folding

1963

# can clean up the imul:

1964

# TODO ffma should probably get a similar rule:

1965

(('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)),

1966

1967

# These will turn 24b address/offset calc back into 32b shifts, but

1968

# it should be safe to get back some of the bits of precision that we

1969

# already decided were no necessary:

1970

(('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),

1971

(('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),

1972

(('imul24', a, 0), (0)),

1973

1974

(('fcsel', ('slt', 0, a), b, c), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),

1975

(('fcsel', ('slt', a, 0), b, c), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),

1976

(('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),

1977

(('fcsel', ('sge', 0, a), b, c), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),

1978

1979

(('bcsel', ('ilt', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, b, c), "options->has_fused_comp_and_csel"),

1980

(('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b), "options->has_fused_comp_and_csel"),

1981

(('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c), "options->has_fused_comp_and_csel"),

1982

(('bcsel', ('ige', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, c, b), "options->has_fused_comp_and_csel"),

1983

1984

(('bcsel', ('flt', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),

1985

(('bcsel', ('flt', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),

1986

(('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),

1987

(('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),

1988

1989

])

1990

1991

# bit_size dependent lowerings

1992

for bit_size in [8, 16, 32, 64]:

1993

# convenience constants

1994

intmax = (1 << (bit_size - 1)) - 1

1995

intmin = 1 << (bit_size - 1)

1996

1997

optimizations += [

1998

(('iadd_sat@' + str(bit_size), a, b),

1999

('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)),

2000

('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_iadd_sat'),

2001

(('isub_sat@' + str(bit_size), a, b),

2002

('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)),

2003

('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_iadd_sat'),

2004

]

2005

2006

invert = OrderedDict([('feq', 'fneu'), ('fneu', 'feq')])

2007

2008

for left, right in itertools.combinations_with_replacement(invert.keys(), 2):

2009

optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),

2010

('iand', (invert[left], a, b), (invert[right], c, d))))

2011

optimizations.append((('inot', ('iand(is_used_once)', (left, a, b), (right, c, d))),

2012

('ior', (invert[left], a, b), (invert[right], c, d))))

2013

2014

# Optimize x2bN(b2x(x)) -> x

2015

for size in type_sizes('bool'):

2016

aN = 'a@' + str(size)

2017

f2bN = 'f2b' + str(size)

2018

i2bN = 'i2b' + str(size)

2019

optimizations.append(((f2bN, ('b2f', aN)), a))

2020

optimizations.append(((i2bN, ('b2i', aN)), a))

2021

2022

# Optimize x2yN(b2x(x)) -> b2y

2023

for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']):

2024

if x != 'f' and y != 'f' and x != y:

2025

continue

2026

2027

b2x = 'b2f' if x == 'f' else 'b2i'

2028

b2y = 'b2f' if y == 'f' else 'b2i'

2029

x2yN = '{}2{}'.format(x, y)

2030

optimizations.append(((x2yN, (b2x, a)), (b2y, a)))

2031

2032

# Optimize away x2xN(a@N)

2033

for t in ['int', 'uint', 'float', 'bool']:

2034

for N in type_sizes(t):

2035

x2xN = '{0}2{0}{1}'.format(t[0], N)

2036

aN = 'a@{0}'.format(N)

2037

optimizations.append(((x2xN, aN), a))

2038

2039

# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers

2040

# In particular, we can optimize away everything except upcast of downcast and

2041

# upcasts where the type differs from the other cast

2042

for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):

2043

if N < M:

2044

# The outer cast is a down-cast. It doesn't matter what the size of the

2045

# argument of the inner cast is because we'll never been in the upcast

2046

# of downcast case. Regardless of types, we'll always end up with y2yN

2047

# in the end.

2048

for x, y in itertools.product(['i', 'u'], ['i', 'u']):

2049

x2xN = '{0}2{0}{1}'.format(x, N)

2050

y2yM = '{0}2{0}{1}'.format(y, M)

2051

y2yN = '{0}2{0}{1}'.format(y, N)

2052

optimizations.append(((x2xN, (y2yM, a)), (y2yN, a)))

2053

elif N > M:

2054

# If the outer cast is an up-cast, we have to be more careful about the

2055

# size of the argument of the inner cast and with types. In this case,

2056

# the type is always the type of type up-cast which is given by the

2057

# outer cast.

2058

for P in type_sizes('uint'):

2059

# We can't optimize away up-cast of down-cast.

2060

if M < P:

2061

continue

2062

2063

# Because we're doing down-cast of down-cast, the types always have

2064

# to match between the two casts

2065

for x in ['i', 'u']:

2066

x2xN = '{0}2{0}{1}'.format(x, N)

2067

x2xM = '{0}2{0}{1}'.format(x, M)

2068

aP = 'a@{0}'.format(P)

2069

optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a)))

2070

else:

2071

# The N == M case is handled by other optimizations

2072

pass

2073

2074

# Downcast operations should be able to see through pack

2075

for t in ['i', 'u']:

2076

for N in [8, 16, 32]:

2077

x2xN = '{0}2{0}{1}'.format(t, N)

2078

optimizations += [

2079

((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)),

2080

((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)),

2081

]

2082

2083

# Optimize comparisons with up-casts

2084

for t in ['int', 'uint', 'float']:

2085

for N, M in itertools.product(type_sizes(t), repeat=2):

2086

if N == 1 or N >= M:

2087

continue

2088

2089

cond = 'true'

2090

if N == 8:

2091

cond = 'options->support_8bit_alu'

2092

elif N == 16:

2093

cond = 'options->support_16bit_alu'

2094

x2xM = '{0}2{0}{1}'.format(t[0], M)

2095

x2xN = '{0}2{0}{1}'.format(t[0], N)

2096

aN = 'a@' + str(N)

2097

bN = 'b@' + str(N)

2098

xeq = 'feq' if t == 'float' else 'ieq'

2099

xne = 'fneu' if t == 'float' else 'ine'

2100

xge = '{0}ge'.format(t[0])

2101

xlt = '{0}lt'.format(t[0])

2102

2103

# Up-casts are lossless so for correctly signed comparisons of

2104

# up-casted values we can do the comparison at the largest of the two

2105

# original sizes and drop one or both of the casts. (We have

2106

# optimizations to drop the no-op casts which this may generate.)

2107

for P in type_sizes(t):

2108

if P == 1 or P > N:

2109

continue

2110

2111

bP = 'b@' + str(P)

2112

optimizations += [

2113

((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b)), cond),

2114

((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b)), cond),

2115

((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b)), cond),

2116

((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b)), cond),

2117

((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a), cond),

2118

((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a), cond),

2119

]

2120

2121

# The next bit doesn't work on floats because the range checks would

2122

# get way too complicated.

2123

if t in ['int', 'uint']:

2124

if t == 'int':

2125

xN_min = -(1 << (N - 1))

2126

xN_max = (1 << (N - 1)) - 1

2127

elif t == 'uint':

2128

xN_min = 0

2129

xN_max = (1 << N) - 1

2130

else:

2131

assert False

2132

2133

# If we're up-casting and comparing to a constant, we can unfold

2134

# the comparison into a comparison with the shrunk down constant

2135

# and a check that the constant fits in the smaller bit size.

2136

optimizations += [

2137

((xeq, (x2xM, aN), '#b'),

2138

('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b)), cond),

2139

((xne, (x2xM, aN), '#b'),

2140

('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b)), cond),

2141

((xlt, (x2xM, aN), '#b'),

2142

('iand', (xlt, xN_min, b),

2143

('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b)))), cond),

2144

((xlt, '#a', (x2xM, bN)),

2145

('iand', (xlt, a, xN_max),

2146

('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b))), cond),

2147

((xge, (x2xM, aN), '#b'),

2148

('iand', (xge, xN_max, b),

2149

('ior', (xge, xN_min, b), (xge, a, (x2xN, b)))), cond),

2150

((xge, '#a', (x2xM, bN)),

2151

('iand', (xge, a, xN_min),

2152

('ior', (xge, a, xN_max), (xge, (x2xN, a), b))), cond),

2153

]

2154

2155

# Convert masking followed by signed downcast to just unsigned downcast

2156

optimizations += [

2157

(('i2i32', ('iand', 'a@64', 0xffffffff)), ('u2u32', a)),

2158

(('i2i16', ('iand', 'a@32', 0xffff)), ('u2u16', a)),

2159

(('i2i16', ('iand', 'a@64', 0xffff)), ('u2u16', a)),

2160

(('i2i8', ('iand', 'a@16', 0xff)), ('u2u8', a)),

2161

(('i2i8', ('iand', 'a@32', 0xff)), ('u2u8', a)),

2162

(('i2i8', ('iand', 'a@64', 0xff)), ('u2u8', a)),

2163

]

2164

2165

# Some operations such as iadd have the property that the bottom N bits of the

2166

# output only depends on the bottom N bits of each of the inputs so we can

2167

# remove casts

2168

for N in [16, 32]:

2169

for M in [8, 16]:

2170

if M >= N:

2171

continue

2172

2173

aN = 'a@' + str(N)

2174

u2uM = 'u2u{0}'.format(M)

2175

i2iM = 'i2i{0}'.format(M)

2176

2177

for x in ['u', 'i']:

2178

x2xN = '{0}2{0}{1}'.format(x, N)

2179

extract_xM = 'extract_{0}{1}'.format(x, M)

2180

2181

x2xN_M_bits = '{0}(only_lower_{1}_bits_used)'.format(x2xN, M)

2182

extract_xM_M_bits = \

2183

'{0}(only_lower_{1}_bits_used)'.format(extract_xM, M)

2184

optimizations += [

2185

((x2xN_M_bits, (u2uM, aN)), a),

2186

((extract_xM_M_bits, aN, 0), a),

2187

]

2188

2189

bcsel_M_bits = 'bcsel(only_lower_{0}_bits_used)'.format(M)

2190

optimizations += [

2191

((bcsel_M_bits, c, (x2xN, (u2uM, aN)), b), ('bcsel', c, a, b)),

2192

((bcsel_M_bits, c, (x2xN, (i2iM, aN)), b), ('bcsel', c, a, b)),

2193

((bcsel_M_bits, c, (extract_xM, aN, 0), b), ('bcsel', c, a, b)),

2194

]

2195

2196

for op in ['iadd', 'imul', 'iand', 'ior', 'ixor']:

2197

op_M_bits = '{0}(only_lower_{1}_bits_used)'.format(op, M)

2198

optimizations += [

2199

((op_M_bits, (x2xN, (u2uM, aN)), b), (op, a, b)),

2200

((op_M_bits, (x2xN, (i2iM, aN)), b), (op, a, b)),

2201

((op_M_bits, (extract_xM, aN, 0), b), (op, a, b)),

2202

]

2203

2204

def fexp2i(exp, bits):

2205

# Generate an expression which constructs value 2.0^exp or 0.0.

2206

2207

# We assume that exp is already in a valid range:

2208

2209

# * [-15, 15] for 16-bit float

2210

# * [-127, 127] for 32-bit float

2211

# * [-1023, 1023] for 16-bit float

2212

2213

# If exp is the lowest value in the valid range, a value of 0.0 is

2214

# constructed. Otherwise, the value 2.0^exp is constructed.

2215

if bits == 16:

2216

return ('i2i16', ('ishl', ('iadd', exp, 15), 10))

2217

elif bits == 32:

2218

return ('ishl', ('iadd', exp, 127), 23)

2219

elif bits == 64:

2220

return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))

2221

else:

2222

assert False

2223

2224

def ldexp(f, exp, bits):

2225

# The maximum possible range for a normal exponent is [-126, 127] and,

2226

# throwing in denormals, you get a maximum range of [-149, 127]. This

2227

# means that we can potentially have a swing of +-276. If you start with

2228

# FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush

2229

# all the way to zero. The GLSL spec only requires that we handle a subset

2230

# of this range. From version 4.60 of the spec:

2231

2232

# "If exp is greater than +128 (single-precision) or +1024

2233

# (double-precision), the value returned is undefined. If exp is less

2234

# than -126 (single-precision) or -1022 (double-precision), the value

2235

# returned may be flushed to zero. Additionally, splitting the value

2236

# into a significand and exponent using frexp() and then reconstructing

2237

# a floating-point value using ldexp() should yield the original input

2238

# for zero and all finite non-denormalized values."

2239

2240

# The SPIR-V spec has similar language.

2241

2242

# In order to handle the maximum value +128 using the fexp2i() helper

2243

# above, we have to split the exponent in half and do two multiply

2244

# operations.

2245

2246

# First, we clamp exp to a reasonable range. Specifically, we clamp to

2247

# twice the full range that is valid for the fexp2i() function above. If

2248

# exp/2 is the bottom value of that range, the fexp2i() expression will

2249

# yield 0.0f which, when multiplied by f, will flush it to zero which is

2250

# allowed by the GLSL and SPIR-V specs for low exponent values. If the

2251

# value is clamped from above, then it must have been above the supported

2252

# range of the GLSL built-in and therefore any return value is acceptable.

2253

if bits == 16:

2254

exp = ('imin', ('imax', exp, -30), 30)

2255

elif bits == 32:

2256

exp = ('imin', ('imax', exp, -254), 254)

2257

elif bits == 64:

2258

exp = ('imin', ('imax', exp, -2046), 2046)

2259

else:

2260

assert False

2261

2262

# Now we compute two powers of 2, one for exp/2 and one for exp-exp/2.

2263

# (We use ishr which isn't the same for -1, but the -1 case still works

2264

# since we use exp-exp/2 as the second exponent.) While the spec

2265

# technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't

2266

# work with denormals and doesn't allow for the full swing in exponents

2267

# that you can get with normalized values. Instead, we create two powers

2268

# of two and multiply by them each in turn. That way the effective range

2269

# of our exponent is doubled.

2270

pow2_1 = fexp2i(('ishr', exp, 1), bits)

2271

pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1)), bits)

2272

return ('fmul', ('fmul', f, pow2_1), pow2_2)

2273

2274

optimizations += [

2275

(('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'),

2276

(('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'),

2277

(('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'),

2278

]

2279

2280

# Unreal Engine 4 demo applications open-codes bitfieldReverse()

2281

def bitfield_reverse_ue4(u):

2282

step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16))

2283

step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8))

2284

step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4))

2285

step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2))

2286

step5 = ('ior(many-comm-expr)', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1))

2287

2288

return step5

2289

2290

# Cyberpunk 2077 open-codes bitfieldReverse()

2291

def bitfield_reverse_cp2077(u):

2292

step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16))

2293

step2 = ('ior', ('iand', ('ishl', step1, 1), 0xaaaaaaaa), ('iand', ('ushr', step1, 1), 0x55555555))

2294

step3 = ('ior', ('iand', ('ishl', step2, 2), 0xcccccccc), ('iand', ('ushr', step2, 2), 0x33333333))

2295

step4 = ('ior', ('iand', ('ishl', step3, 4), 0xf0f0f0f0), ('iand', ('ushr', step3, 4), 0x0f0f0f0f))

2296

step5 = ('ior(many-comm-expr)', ('iand', ('ishl', step4, 8), 0xff00ff00), ('iand', ('ushr', step4, 8), 0x00ff00ff))

2297

2298

return step5

2299

2300

optimizations += [(bitfield_reverse_ue4('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]

2301

optimizations += [(bitfield_reverse_cp2077('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]

2302

2303

# "all_equal(eq(a, b), vec(~0))" is the same as "all_equal(a, b)"

2304

# "any_nequal(neq(a, b), vec(0))" is the same as "any_nequal(a, b)"

2305

for ncomp in [2, 3, 4, 8, 16]:

2306

optimizations += [

2307

(('ball_iequal' + str(ncomp), ('ieq', a, b), ~0), ('ball_iequal' + str(ncomp), a, b)),

2308

(('ball_iequal' + str(ncomp), ('feq', a, b), ~0), ('ball_fequal' + str(ncomp), a, b)),

2309

(('bany_inequal' + str(ncomp), ('ine', a, b), 0), ('bany_inequal' + str(ncomp), a, b)),

2310

(('bany_inequal' + str(ncomp), ('fneu', a, b), 0), ('bany_fnequal' + str(ncomp), a, b)),

2311

]

2312

2313

# For any float comparison operation, "cmp", if you have "a == a && a cmp b"

2314

# then the "a == a" is redundant because it's equivalent to "a is not NaN"

2315

# and, if a is a NaN then the second comparison will fail anyway.

2316

for op in ['flt', 'fge', 'feq']:

2317

optimizations += [

2318

(('iand', ('feq', a, a), (op, a, b)), ('!' + op, a, b)),

2319

(('iand', ('feq', a, a), (op, b, a)), ('!' + op, b, a)),

2320

]

2321

2322

# Add optimizations to handle the case where the result of a ternary is

2323

# compared to a constant. This way we can take things like

2324

2325

# (a ? 0 : 1) > 0

2326

2327

# and turn it into

2328

2329

# a ? (0 > 0) : (1 > 0)

2330

2331

# which constant folding will eat for lunch. The resulting ternary will

2332

# further get cleaned up by the boolean reductions above and we will be

2333

# left with just the original variable "a".

2334

for op in ['feq', 'fneu', 'ieq', 'ine']:

2335

optimizations += [

2336

((op, ('bcsel', 'a', '#b', '#c'), '#d'),

2337

('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))),

2338

]

2339

2340

for op in ['flt', 'fge', 'ilt', 'ige', 'ult', 'uge']:

2341

optimizations += [

2342

((op, ('bcsel', 'a', '#b', '#c'), '#d'),

2343

('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))),

2344

((op, '#d', ('bcsel', a, '#b', '#c')),

2345

('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))),

2346

]

2347

2348

2349

# For example, this converts things like

2350

2351

# 1 + mix(0, a - 1, condition)

2352

2353

# into

2354

2355

# mix(1, (a-1)+1, condition)

2356

2357

# Other optimizations will rearrange the constants.

2358

for op in ['fadd', 'fmul', 'fmulz', 'iadd', 'imul']:

2359

optimizations += [

2360

((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d)))

2361

]

2362

2363

# For derivatives in compute shaders, GLSL_NV_compute_shader_derivatives

2364

# states:

2365

2366

# If neither layout qualifier is specified, derivatives in compute shaders

2367

# return zero, which is consistent with the handling of built-in texture

2368

# functions like texture() in GLSL 4.50 compute shaders.

2369

for op in ['fddx', 'fddx_fine', 'fddx_coarse',

2370

'fddy', 'fddy_fine', 'fddy_coarse']:

2371

optimizations += [

2372

((op, 'a'), 0.0, 'info->stage == MESA_SHADER_COMPUTE && info->cs.derivative_group == DERIVATIVE_GROUP_NONE')

2373

]

2374

2375

# Some optimizations for ir3-specific instructions.

2376

optimizations += [

2377

# 'al * bl': If either 'al' or 'bl' is zero, return zero.

2378

(('umul_low', '#a(is_lower_half_zero)', 'b'), (0)),

2379

# '(ah * bl) << 16 + c': If either 'ah' or 'bl' is zero, return 'c'.

2380

(('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')),

2381

(('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')),

2382

]

2383

2384

# These kinds of sequences can occur after nir_opt_peephole_select.

2385

2386

# NOTE: fadd is not handled here because that gets in the way of ffma

2387

# generation in the i965 driver. Instead, fadd and ffma are handled in

2388

# late_optimizations.

2389

2390

for op in ['flrp']:

2391

optimizations += [

2392

(('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),

2393

(('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),

2394

(('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),

2395

(('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),

2396

(('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)),

2397

(('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)),

2398

]

2399

2400

for op in ['fmulz', 'fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']:

2401

optimizations += [

2402

(('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),

2403

(('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))),

2404

(('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),

2405

(('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),

2406

]

2407

2408

for op in ['fpow']:

2409

optimizations += [

2410

(('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),

2411

(('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),

2412

(('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)),

2413

(('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)),

2414

]

2415

2416

for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos', 'fneg', 'fabs', 'fsign']:

2417

optimizations += [

2418

(('bcsel', c, (op + '(is_used_once)', a), (op + '(is_used_once)', b)), (op, ('bcsel', c, a, b))),

2419

]

2420

2421

for op in ['ineg', 'iabs', 'inot', 'isign']:

2422

optimizations += [

2423

((op, ('bcsel', c, '#a', '#b')), ('bcsel', c, (op, a), (op, b))),

2424

]

2425

2426

optimizations.extend([

2427

(('fisnormal', 'a@32'), ('ult', 0x1ffffff, ('iadd', ('ishl', a, 1), 0x1000000)), 'options->lower_fisnormal')

2428

])

2429

2430

# This section contains optimizations to propagate downsizing conversions of

2431

# constructed vectors into vectors of downsized components. Whether this is

2432

# useful depends on the SIMD semantics of the backend. On a true SIMD machine,

2433

# this reduces the register pressure of the vector itself and often enables the

2434

# conversions to be eliminated via other algebraic rules or constant folding.

2435

# In the worst case on a SIMD architecture, the propagated conversions may be

2436

# revectorized via nir_opt_vectorize so instruction count is minimally

2437

# impacted.

2438

2439

# On a machine with SIMD-within-a-register only, this actually

2440

# counterintuitively hurts instruction count. These machines are the same that

2441

# require vectorize_vec2_16bit, so we predicate the optimizations on that flag

2442

# not being set.

2443

2444

# Finally for scalar architectures, there should be no difference in generated

2445

# code since it all ends up scalarized at the end, but it might minimally help

2446

# compile-times.

2447

2448

for i in range(2, 4 + 1):

2449

for T in ('f', 'u', 'i'):

2450

vec_inst = ('vec' + str(i),)

2451

2452

indices = ['a', 'b', 'c', 'd']

2453

suffix_in = tuple((indices[j] + '@32') for j in range(i))

2454

2455

to_16 = '{}2{}16'.format(T, T)

2456

to_mp = '{}2{}mp'.format(T, T)

2457

2458

out_16 = tuple((to_16, indices[j]) for j in range(i))

2459

out_mp = tuple((to_mp, indices[j]) for j in range(i))

2460

2461

optimizations += [

2462

((to_16, vec_inst + suffix_in), vec_inst + out_16, '!options->vectorize_vec2_16bit'),

2463

]

2464

# u2ump doesn't exist, because it's equal to i2imp

2465

if T in ['f', 'i']:

2466

optimizations += [

2467

((to_mp, vec_inst + suffix_in), vec_inst + out_mp, '!options->vectorize_vec2_16bit')

2468

]

2469

2470

# This section contains "late" optimizations that should be run before

2471

# creating ffmas and calling regular optimizations for the final time.

2472

# Optimizations should go here if they help code generation and conflict

2473

# with the regular optimizations.

2474

before_ffma_optimizations = [

2475

# Propagate constants down multiplication chains

2476

(('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fmul', ('fmul', a, c), b)),

2477

(('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('imul', ('imul', a, c), b)),

2478

(('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fadd', ('fadd', a, c), b)),

2479

(('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('iadd', ('iadd', a, c), b)),

2480

2481

(('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),

2482

(('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),

2483

(('~fadd', ('fneg', a), a), 0.0),

2484

(('iadd', ('ineg', a), a), 0),

2485

(('iadd', ('ineg', a), ('iadd', a, b)), b),

2486

(('iadd', a, ('iadd', ('ineg', a), b)), b),

2487

(('~fadd', ('fneg', a), ('fadd', a, b)), b),

2488

(('~fadd', a, ('fadd', ('fneg', a), b)), b),

2489

2490

(('~flrp', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a, 1.0), d), ('fadd', ('flrp', -1.0, 1.0, d), a)),

2491

(('~flrp', ('fadd(is_used_once)', a, 1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp', 1.0, -1.0, d), a)),

2492

(('~flrp', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))),

2493

]

2494

2495

# This section contains "late" optimizations that should be run after the

2496

# regular optimizations have finished. Optimizations should go here if

2497

# they help code generation but do not necessarily produce code that is

2498

# more easily optimizable.

2499

late_optimizations = [

2500

# The rearrangements are fine w.r.t. NaN. However, they produce incorrect

2501

# results if one operand is +Inf and the other is -Inf.

2502

2503

# 1. Inf + -Inf = NaN

2504

# 2. ∀x: x + NaN = NaN and x - NaN = NaN

2505

# 3. ∀x: x != NaN = true

2506

# 4. ∀x, ∀ cmp ∈ {<, >, ≤, ≥, =}: x cmp NaN = false

2507

2508

# a=Inf, b=-Inf a=-Inf, b=Inf a=NaN b=NaN

2509

# (a+b) < 0 false false false false

2510

# a < -b false false false false

2511

# -(a+b) < 0 false false false false

2512

# -a < b false false false false

2513

# (a+b) >= 0 false false false false

2514

# a >= -b true true false false

2515

# -(a+b) >= 0 false false false false

2516

# -a >= b true true false false

2517

# (a+b) == 0 false false false false

2518

# a == -b true true false false

2519

# (a+b) != 0 true true true true

2520

# a != -b false false true true

2521

(('flt', ('fadd(is_used_once)', a, b), 0.0), ('flt', a, ('fneg', b))),

2522

(('flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('flt', ('fneg', a), b)),

2523

(('flt', 0.0, ('fadd(is_used_once)', a, b) ), ('flt', ('fneg', a), b)),

2524

(('flt', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('flt', a, ('fneg', b))),

2525

(('~fge', ('fadd(is_used_once)', a, b), 0.0), ('fge', a, ('fneg', b))),

2526

(('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('fge', ('fneg', a), b)),

2527

(('~fge', 0.0, ('fadd(is_used_once)', a, b) ), ('fge', ('fneg', a), b)),

2528

(('~fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fge', a, ('fneg', b))),

2529

(('~feq', ('fadd(is_used_once)', a, b), 0.0), ('feq', a, ('fneg', b))),

2530

(('~fneu', ('fadd(is_used_once)', a, b), 0.0), ('fneu', a, ('fneg', b))),

2531

2532

# If either source must be finite, then the original (a+b) cannot produce

2533

# NaN due to Inf-Inf. The patterns and the replacements produce the same

2534

# result if b is NaN. Therefore, the replacements are exact.

2535

(('fge', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fge', a, ('fneg', b))),

2536

(('fge', ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b)), 0.0), ('fge', ('fneg', a), b)),

2537

(('fge', 0.0, ('fadd(is_used_once)', 'a(is_finite)', b) ), ('fge', ('fneg', a), b)),

2538

(('fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b))), ('fge', a, ('fneg', b))),

2539

(('feq', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('feq', a, ('fneg', b))),

2540

(('fneu', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fneu', a, ('fneg', b))),

2541

2542

# This is how SpvOpFOrdNotEqual might be implemented. Replace it with

2543

# SpvOpLessOrGreater.

2544

(('iand', ('fneu', a, b), ('iand', ('feq', a, a), ('feq', b, b))), ('ior', ('!flt', a, b), ('!flt', b, a))),

2545

(('iand', ('fneu', a, 0.0), ('feq', a, a) ), ('!flt', 0.0, ('fabs', a))),

2546

2547

# This is how SpvOpFUnordEqual might be implemented. Replace it with

2548

# !SpvOpLessOrGreater.

2549

(('ior', ('feq', a, b), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('ior', ('!flt', a, b), ('!flt', b, a)))),

2550

(('ior', ('feq', a, 0.0), ('fneu', a, a), ), ('inot', ('!flt', 0.0, ('fabs', a)))),

2551

2552

# nir_lower_to_source_mods will collapse this, but its existence during the

2553

# optimization loop can prevent other optimizations.

2554

(('fneg', ('fneg', a)), a),

2555

2556

# re-combine inexact mul+add to ffma. Do this before fsub so that a * b - c

2557

# gets combined to fma(a, b, -c).

2558

(('~fadd@16', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma16'),

2559

(('~fadd@32', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma32'),

2560

(('~fadd@64', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma64'),

2561

(('~fadd@32', ('fmulz', a, b), c), ('ffmaz', a, b, c), 'options->fuse_ffma32'),

2562

2563

# Subtractions get lowered during optimization, so we need to recombine them

2564

(('fadd', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),

2565

(('fneg', a), ('fmul', a, -1.0), 'options->lower_fneg'),

2566

(('iadd', a, ('ineg', 'b')), ('isub', 'a', 'b'), 'options->has_isub || options->lower_ineg'),

2567

(('ineg', a), ('isub', 0, a), 'options->lower_ineg'),

2568

(('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'),

2569

2570

(('iadd', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), 'options->has_iadd3'),

2571

(('iadd', ('isub(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, ('ineg', b), c), 'options->has_iadd3'),

2572

(('isub', ('isub(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, ('ineg', b), ('ineg', c)), 'options->has_iadd3'),

2573

2574

# fneg_lo / fneg_hi

2575

(('vec2(is_only_used_as_float)', ('fneg@16', a), b), ('fmul', ('vec2', a, b), ('vec2', -1.0, 1.0)), 'options->vectorize_vec2_16bit'),

2576

(('vec2(is_only_used_as_float)', a, ('fneg@16', b)), ('fmul', ('vec2', a, b), ('vec2', 1.0, -1.0)), 'options->vectorize_vec2_16bit'),

2577

2578

# These are duplicated from the main optimizations table. The late

2579

# patterns that rearrange expressions like x - .5 < 0 to x < .5 can create

2580

# new patterns like these. The patterns that compare with zero are removed

2581

# because they are unlikely to be created in by anything in

2582

# late_optimizations.

2583

(('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),

2584

(('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),

2585

(('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),

2586

(('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),

2587

2588

(('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),

2589

2590

(('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))),

2591

2592

(('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),

2593

(('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),

2594

(('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),

2595

(('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),

2596

(('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),

2597

(('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),

2598

(('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),

2599

(('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),

2600

(('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),

2601

(('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),

2602

2603

(('ior', a, a), a),

2604

(('iand', a, a), a),

2605

2606

(('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),

2607

2608

(('fdot2', a, b), ('fdot2_replicated', a, b), 'options->fdot_replicates'),

2609

(('fdot3', a, b), ('fdot3_replicated', a, b), 'options->fdot_replicates'),

2610

(('fdot4', a, b), ('fdot4_replicated', a, b), 'options->fdot_replicates'),

2611

(('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),

2612

2613

(('~flrp', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),

2614

2615

# A similar operation could apply to any ffma(#a, b, #(-a/2)), but this

2616

# particular operation is common for expanding values stored in a texture

2617

# from [0,1] to [-1,1].

2618

(('~ffma@32', a, 2.0, -1.0), ('flrp', -1.0, 1.0, a ), '!options->lower_flrp32'),

2619

(('~ffma@32', a, -2.0, -1.0), ('flrp', -1.0, 1.0, ('fneg', a)), '!options->lower_flrp32'),

2620

(('~ffma@32', a, -2.0, 1.0), ('flrp', 1.0, -1.0, a ), '!options->lower_flrp32'),

2621

(('~ffma@32', a, 2.0, 1.0), ('flrp', 1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),

2622

(('~fadd@32', ('fmul(is_used_once)', 2.0, a), -1.0), ('flrp', -1.0, 1.0, a ), '!options->lower_flrp32'),

2623

(('~fadd@32', ('fmul(is_used_once)', -2.0, a), -1.0), ('flrp', -1.0, 1.0, ('fneg', a)), '!options->lower_flrp32'),

2624

(('~fadd@32', ('fmul(is_used_once)', -2.0, a), 1.0), ('flrp', 1.0, -1.0, a ), '!options->lower_flrp32'),

2625

(('~fadd@32', ('fmul(is_used_once)', 2.0, a), 1.0), ('flrp', 1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),

2626

2627

# flrp(a, b, a)

2628

# a*(1-a) + b*a

2629

# a + -a*a + a*b (1)

2630

# a + a*(b - a)

2631

# Option 1: ffma(a, (b-a), a)

2632

2633

# Alternately, after (1):

2634

# a*(1+b) + -a*a

2635

# a*((1+b) + -a)

2636

2637

# Let b=1

2638

2639

# Option 2: ffma(a, 2, -(a*a))

2640

# Option 3: ffma(a, 2, (-a)*a)

2641

# Option 4: ffma(a, -a, (2*a)

2642

# Option 5: a * (2 - a)

2643

2644

# There are a lot of other possible combinations.

2645

(('~ffma@32', ('fadd', b, ('fneg', a)), a, a), ('flrp', a, b, a), '!options->lower_flrp32'),

2646

(('~ffma@32', a, 2.0, ('fneg', ('fmul', a, a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'),

2647

(('~ffma@32', a, 2.0, ('fmul', ('fneg', a), a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),

2648

(('~ffma@32', a, ('fneg', a), ('fmul', 2.0, a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),

2649

(('~fmul@32', a, ('fadd', 2.0, ('fneg', a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'),

2650

2651

# we do these late so that we don't get in the way of creating ffmas

2652

(('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),

2653

(('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),

2654

2655

# Putting this in 'optimizations' interferes with the bcsel(a, op(b, c),

2656

# op(b, d)) => op(b, bcsel(a, c, d)) transformations. I do not know why.

2657

(('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)),

2658

('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))),

2659

2660

# Things that look like DPH in the source shader may get expanded to

2661

# something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets

2662

# to NIR. After FFMA is generated, this can look like:

2663

2664

# fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w)

2665

2666

# Reassociate the last addition into the first multiplication.

2667

2668

# Some shaders do not use 'invariant' in vertex and (possibly) geometry

2669

# shader stages on some outputs that are intended to be invariant. For

2670

# various reasons, this optimization may not be fully applied in all

2671

# shaders used for different rendering passes of the same geometry. This

2672

# can result in Z-fighting artifacts (at best). For now, disable this

2673

# optimization in these stages. See bugzilla #111490. In tessellation

2674

# stages applications seem to use 'precise' when necessary, so allow the

2675

# optimization in those stages.

2676

(('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),

2677

('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),

2678

(('~fadd', ('ffma(is_used_once)', a, b, ('fmul(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),

2679

('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),

2680

(('~fadd', ('fneg', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),

2681

('ffma', ('fneg', a), b, ('ffma', ('fneg', c), d, ('ffma', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),

2682

2683

(('~fadd', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),

2684

('ffmaz', a, b, ('ffmaz', c, d, ('ffmaz', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),

2685

(('~fadd', ('ffmaz(is_used_once)', a, b, ('fmulz(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),

2686

('ffmaz', a, b, ('ffmaz', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),

2687

(('~fadd', ('fneg', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),

2688

('ffmaz', ('fneg', a), b, ('ffmaz', ('fneg', c), d, ('ffmaz', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),

2689

2690

# Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:

2691

2692

# If bits is zero, the result will be zero.

2693

2694

# These prevent the next two lowerings generating incorrect results when

2695

# count is zero.

2696

(('ubfe', a, b, 0), 0),

2697

(('ibfe', a, b, 0), 0),

2698

2699

# On Intel GPUs, BFE is a 3-source instruction. Like all 3-source

2700

# instructions on Intel GPUs, it cannot have an immediate values as

2701

# sources. There are also limitations on source register strides. As a

2702

# result, it is very easy for 3-source instruction combined with either

2703

# loads of immediate values or copies from weird register strides to be

2704

# more expensive than the primitive instructions it represents.

2705

(('ubfe', a, '#b', '#c'), ('iand', ('ushr', 0xffffffff, ('ineg', c)), ('ushr', a, b)), 'options->avoid_ternary_with_two_constants'),

2706

2707

# b is the lowest order bit to be extracted and c is the number of bits to

2708

# extract. The inner shift removes the bits above b + c by shifting left

2709

# 32 - (b + c). ishl only sees the low 5 bits of the shift count, which is

2710

# -(b + c). The outer shift moves the bit that was at b to bit zero.

2711

# After the first shift, that bit is now at b + (32 - (b + c)) or 32 - c.

2712

# This means that it must be shifted right by 32 - c or -c bits.

2713

(('ibfe', a, '#b', '#c'), ('ishr', ('ishl', a, ('ineg', ('iadd', b, c))), ('ineg', c)), 'options->avoid_ternary_with_two_constants'),

2714

2715

# Clean up no-op shifts that may result from the bfe lowerings.

2716

(('ishl', a, 0), a),

2717

(('ishl', a, -32), a),

2718

(('ishr', a, 0), a),

2719

(('ishr', a, -32), a),

2720

(('ushr', a, 0), a),

2721

2722

(('extract_i8', ('extract_i8', a, b), 0), ('extract_i8', a, b)),

2723

(('extract_i8', ('extract_u8', a, b), 0), ('extract_i8', a, b)),

2724

(('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)),

2725

(('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)),

2726

]

2727

2728

# A few more extract cases we'd rather leave late

2729

for N in [16, 32]:

2730

aN = 'a@{0}'.format(N)

2731

u2uM = 'u2u{0}'.format(M)

2732

i2iM = 'i2i{0}'.format(M)

2733

2734

for x in ['u', 'i']:

2735

x2xN = '{0}2{0}{1}'.format(x, N)

2736

extract_x8 = 'extract_{0}8'.format(x)

2737

extract_x16 = 'extract_{0}16'.format(x)

2738

2739

late_optimizations.extend([

2740

((x2xN, ('u2u8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'),

2741

((x2xN, ('i2i8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'),

2742

])

2743

2744

if N > 16:

2745

late_optimizations.extend([

2746

((x2xN, ('u2u16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'),

2747

((x2xN, ('i2i16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'),

2748

])

2749

2750

# Byte insertion

2751

late_optimizations.extend([(('ishl', ('extract_u8', 'a@32', 0), 8 * i), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)])

2752

late_optimizations.extend([(('iand', ('ishl', 'a@32', 8 * i), 0xff << (8 * i)), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)])

2753

late_optimizations.append((('ishl', 'a@32', 24), ('insert_u8', a, 3), '!options->lower_insert_byte'))

2754

2755

late_optimizations += [

2756

# Word insertion

2757

(('ishl', 'a@32', 16), ('insert_u16', a, 1), '!options->lower_insert_word'),

2758

2759

# Extract and then insert

2760

(('insert_u8', ('extract_u8', 'a', 0), b), ('insert_u8', a, b)),

2761

(('insert_u16', ('extract_u16', 'a', 0), b), ('insert_u16', a, b)),

2762

]

2763

2764

# Integer sizes

2765

for s in [8, 16, 32, 64]:

2766

late_optimizations.extend([

2767

(('iand', ('ine(is_used_once)', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0)),

2768

(('ior', ('ieq(is_used_once)', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0)),

2769

])

2770

2771

# Float sizes

2772

for s in [16, 32, 64]:

2773

late_optimizations.extend([

2774

(('~fadd@{}'.format(s), 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp{}'.format(s)),

2775

(('bcsel', a, 0, ('b2f{}'.format(s), ('inot', 'b@bool'))), ('b2f{}'.format(s), ('inot', ('ior', a, b)))),

2776

])

2777

2778

for op in ['fadd']:

2779

late_optimizations += [

2780

(('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),

2781

(('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),

2782

]

2783

2784

for op in ['ffma', 'ffmaz']:

2785

late_optimizations += [

2786

(('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),

2787

(('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),

2788

2789

(('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),

2790

(('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),

2791

]

2792

2793

# mediump: If an opcode is surrounded by conversions, remove the conversions.

2794

# The rationale is that type conversions + the low precision opcode are more

2795

# expensive that the same arithmetic opcode at higher precision.

2796

2797

# This must be done in late optimizations, because we need normal optimizations to

2798

# first eliminate temporary up-conversions such as in op1(f2fmp(f2f32(op2()))).

2799

2800

# Unary opcodes

2801

for op in ['fabs', 'fceil', 'fcos', 'fddx', 'fddx_coarse', 'fddx_fine', 'fddy',

2802

'fddy_coarse', 'fddy_fine', 'fexp2', 'ffloor', 'ffract', 'flog2', 'fneg',

2803

'frcp', 'fround_even', 'frsq', 'fsat', 'fsign', 'fsin', 'fsqrt']:

2804

late_optimizations += [(('~f2f32', (op, ('f2fmp', a))), (op, a))]

2805

2806

# Binary opcodes

2807

for op in ['fadd', 'fdiv', 'fmax', 'fmin', 'fmod', 'fmul', 'fpow', 'frem']:

2808

late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b))), (op, a, b))]

2809

2810

# Ternary opcodes

2811

for op in ['ffma', 'flrp']:

2812

late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b), ('f2fmp', c))), (op, a, b, c))]

2813

2814

# Comparison opcodes

2815

for op in ['feq', 'fge', 'flt', 'fneu']:

2816

late_optimizations += [(('~' + op, ('f2fmp', a), ('f2fmp', b)), (op, a, b))]

2817

2818

# Do this last, so that the f2fmp patterns above have effect.

2819

late_optimizations += [

2820

# Convert *2*mp instructions to concrete *2*16 instructions. At this point

2821

# any conversions that could have been removed will have been removed in

2822

# nir_opt_algebraic so any remaining ones are required.

2823

(('f2fmp', a), ('f2f16', a)),

2824

(('f2imp', a), ('f2i16', a)),

2825

(('f2ump', a), ('f2u16', a)),

2826

(('i2imp', a), ('i2i16', a)),

2827

(('i2fmp', a), ('i2f16', a)),

2828

(('i2imp', a), ('u2u16', a)),

2829

(('u2fmp', a), ('u2f16', a)),

2830

(('fisfinite', a), ('flt', ('fabs', a), float("inf"))),

2831

]

2832

2833

distribute_src_mods = [

2834

# Try to remove some spurious negations rather than pushing them down.

2835

(('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)),

2836

(('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),

2837

(('fdot2_replicated', ('fneg', a), ('fneg', b)), ('fdot2_replicated', a, b)),

2838

(('fdot3_replicated', ('fneg', a), ('fneg', b)), ('fdot3_replicated', a, b)),

2839

(('fdot4_replicated', ('fneg', a), ('fneg', b)), ('fdot4_replicated', a, b)),

2840

(('fneg', ('fneg', a)), a),

2841

2842

(('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)),

2843

(('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))),

2844

2845

(('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),

2846

(('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),

2847

(('fneg', ('~fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),

2848

2849

# Note that fmin <-> fmax. I don't think there is a way to distribute

2850

# fabs() into fmin or fmax.

2851

(('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))),

2852

(('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))),

2853

2854

(('fneg', ('fdot2_replicated(is_used_once)', a, b)), ('fdot2_replicated', ('fneg', a), b)),

2855

(('fneg', ('fdot3_replicated(is_used_once)', a, b)), ('fdot3_replicated', ('fneg', a), b)),

2856

(('fneg', ('fdot4_replicated(is_used_once)', a, b)), ('fdot4_replicated', ('fneg', a), b)),

2857

2858

# fdph works mostly like fdot, but to get the correct result, the negation

2859

# must be applied to the second source.

2860

(('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))),

2861

2862

(('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))),

2863

(('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))),

2864

]

2865

2866

print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())

2867

print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",

2868

before_ffma_optimizations).render())

2869

print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",

2870

late_optimizations).render())

2871

print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods",

2872

distribute_src_mods).render())

Older »