# ***** BEGIN LICENSE BLOCK *****
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is the Solaris software cryptographic token.
#
# The Initial Developer of the Original Code is
# Sun Microsystems, Inc.
# Portions created by the Initial Developer are Copyright (C) 2005
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Sun Microsystems, Inc.
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ***** END LICENSE BLOCK *****
# ------------------------------------------------------------------------
#
# Implementation of s_mpv_mul_set_vec which exploits
# the 64X64->128 bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------

# r = a * digit, r and a are vectors of length len
# returns the carry digit
# r and a are 64 bit aligned.
#
# uint64_t
# s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#
.text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:
56
xorq %rax, %rax # if (len == 0) return (0)
60
movq %rdx, %r8 # Use r8 for len; %rdx is used by mul
61
xorq %r9, %r9 # cy = 0
64
cmpq $8, %r8 # 8 - len
66
movq 0(%rsi), %rax # rax = a[0]
67
movq 8(%rsi), %r11 # prefetch a[1]
68
mulq %rcx # p = a[0] * digit
70
adcq $0, %rdx # p += cy
71
movq %rax, 0(%rdi) # r[0] = lo(p)
72
movq %rdx, %r9 # cy = hi(p)
75
movq 16(%rsi), %r11 # prefetch a[2]
76
mulq %rcx # p = a[1] * digit
78
adcq $0, %rdx # p += cy
79
movq %rax, 8(%rdi) # r[1] = lo(p)
80
movq %rdx, %r9 # cy = hi(p)
83
movq 24(%rsi), %r11 # prefetch a[3]
84
mulq %rcx # p = a[2] * digit
86
adcq $0, %rdx # p += cy
87
movq %rax, 16(%rdi) # r[2] = lo(p)
88
movq %rdx, %r9 # cy = hi(p)
91
movq 32(%rsi), %r11 # prefetch a[4]
92
mulq %rcx # p = a[3] * digit
94
adcq $0, %rdx # p += cy
95
movq %rax, 24(%rdi) # r[3] = lo(p)
96
movq %rdx, %r9 # cy = hi(p)
99
movq 40(%rsi), %r11 # prefetch a[5]
100
mulq %rcx # p = a[4] * digit
102
adcq $0, %rdx # p += cy
103
movq %rax, 32(%rdi) # r[4] = lo(p)
104
movq %rdx, %r9 # cy = hi(p)
107
movq 48(%rsi), %r11 # prefetch a[6]
108
mulq %rcx # p = a[5] * digit
110
adcq $0, %rdx # p += cy
111
movq %rax, 40(%rdi) # r[5] = lo(p)
112
movq %rdx, %r9 # cy = hi(p)
115
movq 56(%rsi), %r11 # prefetch a[7]
116
mulq %rcx # p = a[6] * digit
118
adcq $0, %rdx # p += cy
119
movq %rax, 48(%rdi) # r[6] = lo(p)
120
movq %rdx, %r9 # cy = hi(p)
123
mulq %rcx # p = a[7] * digit
125
adcq $0, %rdx # p += cy
126
movq %rax, 56(%rdi) # r[7] = lo(p)
127
movq %rdx, %r9 # cy = hi(p)
138
mulq %rcx # p = a[0] * digit
140
adcq $0, %rdx # p += cy
141
movq %rax, 0(%rdi) # r[0] = lo(p)
142
movq %rdx, %r9 # cy = hi(p)
147
mulq %rcx # p = a[1] * digit
149
adcq $0, %rdx # p += cy
150
movq %rax, 8(%rdi) # r[1] = lo(p)
151
movq %rdx, %r9 # cy = hi(p)
156
mulq %rcx # p = a[2] * digit
158
adcq $0, %rdx # p += cy
159
movq %rax, 16(%rdi) # r[2] = lo(p)
160
movq %rdx, %r9 # cy = hi(p)
165
mulq %rcx # p = a[3] * digit
167
adcq $0, %rdx # p += cy
168
movq %rax, 24(%rdi) # r[3] = lo(p)
169
movq %rdx, %r9 # cy = hi(p)
174
mulq %rcx # p = a[4] * digit
176
adcq $0, %rdx # p += cy
177
movq %rax, 32(%rdi) # r[4] = lo(p)
178
movq %rdx, %r9 # cy = hi(p)
183
mulq %rcx # p = a[5] * digit
185
adcq $0, %rdx # p += cy
186
movq %rax, 40(%rdi) # r[5] = lo(p)
187
movq %rdx, %r9 # cy = hi(p)
192
mulq %rcx # p = a[6] * digit
194
adcq $0, %rdx # p += cy
195
movq %rax, 48(%rdi) # r[6] = lo(p)
196
movq %rdx, %r9 # cy = hi(p)
205
.size s_mpv_mul_set_vec64, [.-s_mpv_mul_set_vec64]
# ------------------------------------------------------------------------
#
# Implementation of s_mpv_mul_add_vec which exploits
# the 64X64->128 bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------

# r += a * digit, r and a are vectors of length len
# returns the carry digit
# r and a are 64 bit aligned.
#
# uint64_t
# s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#
.text; .align 16; .globl s_mpv_mul_add_vec64; .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64:
224
xorq %rax, %rax # if (len == 0) return (0)
228
movq %rdx, %r8 # Use r8 for len; %rdx is used by mul
229
xorq %r9, %r9 # cy = 0
232
cmpq $8, %r8 # 8 - len
234
movq 0(%rsi), %rax # rax = a[0]
235
movq 0(%rdi), %r10 # r10 = r[0]
236
movq 8(%rsi), %r11 # prefetch a[1]
237
mulq %rcx # p = a[0] * digit
239
adcq $0, %rdx # p += r[0]
240
movq 8(%rdi), %r10 # prefetch r[1]
242
adcq $0, %rdx # p += cy
243
movq %rax, 0(%rdi) # r[0] = lo(p)
244
movq %rdx, %r9 # cy = hi(p)
247
movq 16(%rsi), %r11 # prefetch a[2]
248
mulq %rcx # p = a[1] * digit
250
adcq $0, %rdx # p += r[1]
251
movq 16(%rdi), %r10 # prefetch r[2]
253
adcq $0, %rdx # p += cy
254
movq %rax, 8(%rdi) # r[1] = lo(p)
255
movq %rdx, %r9 # cy = hi(p)
258
movq 24(%rsi), %r11 # prefetch a[3]
259
mulq %rcx # p = a[2] * digit
261
adcq $0, %rdx # p += r[2]
262
movq 24(%rdi), %r10 # prefetch r[3]
264
adcq $0, %rdx # p += cy
265
movq %rax, 16(%rdi) # r[2] = lo(p)
266
movq %rdx, %r9 # cy = hi(p)
269
movq 32(%rsi), %r11 # prefetch a[4]
270
mulq %rcx # p = a[3] * digit
272
adcq $0, %rdx # p += r[3]
273
movq 32(%rdi), %r10 # prefetch r[4]
275
adcq $0, %rdx # p += cy
276
movq %rax, 24(%rdi) # r[3] = lo(p)
277
movq %rdx, %r9 # cy = hi(p)
280
movq 40(%rsi), %r11 # prefetch a[5]
281
mulq %rcx # p = a[4] * digit
283
adcq $0, %rdx # p += r[4]
284
movq 40(%rdi), %r10 # prefetch r[5]
286
adcq $0, %rdx # p += cy
287
movq %rax, 32(%rdi) # r[4] = lo(p)
288
movq %rdx, %r9 # cy = hi(p)
291
movq 48(%rsi), %r11 # prefetch a[6]
292
mulq %rcx # p = a[5] * digit
294
adcq $0, %rdx # p += r[5]
295
movq 48(%rdi), %r10 # prefetch r[6]
297
adcq $0, %rdx # p += cy
298
movq %rax, 40(%rdi) # r[5] = lo(p)
299
movq %rdx, %r9 # cy = hi(p)
302
movq 56(%rsi), %r11 # prefetch a[7]
303
mulq %rcx # p = a[6] * digit
305
adcq $0, %rdx # p += r[6]
306
movq 56(%rdi), %r10 # prefetch r[7]
308
adcq $0, %rdx # p += cy
309
movq %rax, 48(%rdi) # r[6] = lo(p)
310
movq %rdx, %r9 # cy = hi(p)
313
mulq %rcx # p = a[7] * digit
315
adcq $0, %rdx # p += r[7]
317
adcq $0, %rdx # p += cy
318
movq %rax, 56(%rdi) # r[7] = lo(p)
319
movq %rdx, %r9 # cy = hi(p)
331
mulq %rcx # p = a[0] * digit
333
adcq $0, %rdx # p += r[0]
335
adcq $0, %rdx # p += cy
336
movq %rax, 0(%rdi) # r[0] = lo(p)
337
movq %rdx, %r9 # cy = hi(p)
343
mulq %rcx # p = a[1] * digit
345
adcq $0, %rdx # p += r[1]
347
adcq $0, %rdx # p += cy
348
movq %rax, 8(%rdi) # r[1] = lo(p)
349
movq %rdx, %r9 # cy = hi(p)
355
mulq %rcx # p = a[2] * digit
357
adcq $0, %rdx # p += r[2]
359
adcq $0, %rdx # p += cy
360
movq %rax, 16(%rdi) # r[2] = lo(p)
361
movq %rdx, %r9 # cy = hi(p)
367
mulq %rcx # p = a[3] * digit
369
adcq $0, %rdx # p += r[3]
371
adcq $0, %rdx # p += cy
372
movq %rax, 24(%rdi) # r[3] = lo(p)
373
movq %rdx, %r9 # cy = hi(p)
379
mulq %rcx # p = a[4] * digit
381
adcq $0, %rdx # p += r[4]
383
adcq $0, %rdx # p += cy
384
movq %rax, 32(%rdi) # r[4] = lo(p)
385
movq %rdx, %r9 # cy = hi(p)
391
mulq %rcx # p = a[5] * digit
393
adcq $0, %rdx # p += r[5]
395
adcq $0, %rdx # p += cy
396
movq %rax, 40(%rdi) # r[5] = lo(p)
397
movq %rdx, %r9 # cy = hi(p)
403
mulq %rcx # p = a[6] * digit
405
adcq $0, %rdx # p += r[6]
407
adcq $0, %rdx # p += cy
408
movq %rax, 48(%rdi) # r[6] = lo(p)
409
movq %rdx, %r9 # cy = hi(p)
418
.size s_mpv_mul_add_vec64, [.-s_mpv_mul_add_vec64]
420
# Magic indicating no need for an executable stack
421
.section .note.GNU-stack, "", @progbits