112
112
/* Decimal conversion is by far the most typical, and is used
113
113
* for /proc and /sys data. This directly impacts e.g. top performance
114
114
* with many processes running. We optimize it for speed
116
* http://www.cs.uiowa.edu/~jones/bcd/decimal.html
117
* (with permission from the author, Douglas W. Jones). */
119
/* Formats correctly any integer in [0,99999].
120
* Outputs from one to five digits depending on input.
121
* On i386 gcc 4.1.2 -O2: ~250 bytes of code. */
122
static noinline_for_stack
123
char *put_dec_trunc(char *buf, unsigned q)
125
unsigned d3, d2, d1, d0;
130
d0 = 6*(d3 + d2 + d1) + (q & 0xf);
131
q = (d0 * 0xcd) >> 11;
133
*buf++ = d0 + '0'; /* least significant digit */
134
d1 = q + 9*d3 + 5*d2 + d1;
136
q = (d1 * 0xcd) >> 11;
138
*buf++ = d1 + '0'; /* next digit */
141
if ((d2 != 0) || (d3 != 0)) {
144
*buf++ = d2 + '0'; /* next digit */
148
q = (d3 * 0xcd) >> 11;
150
*buf++ = d3 + '0'; /* next digit */
152
*buf++ = q + '0'; /* most sign. digit */
159
/* Same with if's removed. Always emits five digits */
160
static noinline_for_stack
161
char *put_dec_full(char *buf, unsigned q)
163
/* BTW, if q is in [0,9999], 8-bit ints will be enough, */
164
/* but anyway, gcc produces better code with full-sized ints */
165
unsigned d3, d2, d1, d0;
115
* using ideas described at <http://www.cs.uiowa.edu/~jones/bcd/divide.html>
116
* (with permission from the author, Douglas W. Jones).
119
#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64
120
/* Formats correctly any integer in [0, 999999999] */
121
static noinline_for_stack
122
char *put_dec_full9(char *buf, unsigned q)
171
127
* Possible ways to approx. divide by 10
172
* gcc -O2 replaces multiply with shifts and adds
173
* (x * 0xcd) >> 11: 11001101 - shorter code than * 0x67 (on i386)
174
* (x * 0x67) >> 10: 1100111
175
* (x * 0x34) >> 9: 110100 - same
176
* (x * 0x1a) >> 8: 11010 - same
177
* (x * 0x0d) >> 7: 1101 - same, shortest code (on i386)
128
* (x * 0x1999999a) >> 32 x < 1073741829 (multiply must be 64-bit)
129
* (x * 0xcccd) >> 19 x < 81920 (x < 262149 when 64-bit mul)
130
* (x * 0x6667) >> 18 x < 43699
131
* (x * 0x3334) >> 17 x < 16389
132
* (x * 0x199a) >> 16 x < 16389
133
* (x * 0x0ccd) >> 15 x < 16389
134
* (x * 0x0667) >> 14 x < 2739
135
* (x * 0x0334) >> 13 x < 1029
136
* (x * 0x019a) >> 12 x < 1029
137
* (x * 0x00cd) >> 11 x < 1029 shorter code than * 0x67 (on i386)
138
* (x * 0x0067) >> 10 x < 179
139
* (x * 0x0034) >> 9 x < 69 same
140
* (x * 0x001a) >> 8 x < 69 same
141
* (x * 0x000d) >> 7 x < 69 same, shortest code (on i386)
142
* (x * 0x0007) >> 6 x < 19
143
* See <http://www.cs.uiowa.edu/~jones/bcd/divide.html>
179
d0 = 6*(d3 + d2 + d1) + (q & 0xf);
180
q = (d0 * 0xcd) >> 11;
183
d1 = q + 9*d3 + 5*d2 + d1;
184
q = (d1 * 0xcd) >> 11;
194
q = (d3 * 0xcd) >> 11; /* - shorter code */
195
/* q = (d3 * 0x67) >> 10; - would also work */
145
r = (q * (uint64_t)0x1999999a) >> 32;
146
*buf++ = (q - 10 * r) + '0'; /* 1 */
147
q = (r * (uint64_t)0x1999999a) >> 32;
148
*buf++ = (r - 10 * q) + '0'; /* 2 */
149
r = (q * (uint64_t)0x1999999a) >> 32;
150
*buf++ = (q - 10 * r) + '0'; /* 3 */
151
q = (r * (uint64_t)0x1999999a) >> 32;
152
*buf++ = (r - 10 * q) + '0'; /* 4 */
153
r = (q * (uint64_t)0x1999999a) >> 32;
154
*buf++ = (q - 10 * r) + '0'; /* 5 */
155
/* Now value is under 10000, can avoid 64-bit multiply */
156
q = (r * 0x199a) >> 16;
157
*buf++ = (r - 10 * q) + '0'; /* 6 */
158
r = (q * 0xcd) >> 11;
159
*buf++ = (q - 10 * r) + '0'; /* 7 */
160
q = (r * 0xcd) >> 11;
161
*buf++ = (r - 10 * q) + '0'; /* 8 */
162
*buf++ = q + '0'; /* 9 */
202
/* No inlining helps gcc to use registers better */
167
/* Similar to above but do not pad with zeros.
168
* Code can be easily arranged to print 9 digits too, but our callers
169
* always call put_dec_full9() instead when the number has 9 decimal digits.
203
171
static noinline_for_stack
204
char *put_dec(char *buf, unsigned long long num)
209
return put_dec_trunc(buf, num);
210
rem = do_div(num, 100000);
211
buf = put_dec_full(buf, rem);
172
char *put_dec_trunc8(char *buf, unsigned r)
176
/* Copy of previous function's body with added early returns */
177
q = (r * (uint64_t)0x1999999a) >> 32;
178
*buf++ = (r - 10 * q) + '0'; /* 2 */
181
r = (q * (uint64_t)0x1999999a) >> 32;
182
*buf++ = (q - 10 * r) + '0'; /* 3 */
185
q = (r * (uint64_t)0x1999999a) >> 32;
186
*buf++ = (r - 10 * q) + '0'; /* 4 */
189
r = (q * (uint64_t)0x1999999a) >> 32;
190
*buf++ = (q - 10 * r) + '0'; /* 5 */
193
q = (r * 0x199a) >> 16;
194
*buf++ = (r - 10 * q) + '0'; /* 6 */
197
r = (q * 0xcd) >> 11;
198
*buf++ = (q - 10 * r) + '0'; /* 7 */
201
q = (r * 0xcd) >> 11;
202
*buf++ = (r - 10 * q) + '0'; /* 8 */
205
*buf++ = q + '0'; /* 9 */
209
/* There are two algorithms to print larger numbers.
210
* One is generic: divide by 1000000000 and repeatedly print
211
* groups of (up to) 9 digits. It's conceptually simple,
212
* but requires a (unsigned long long) / 1000000000 division.
214
* Second algorithm splits 64-bit unsigned long long into 16-bit chunks,
215
* manipulates them cleverly and generates groups of 4 decimal digits.
216
* It so happens that it does NOT require long long division.
218
* If long is > 32 bits, division of 64-bit values is relatively easy,
219
* and we will use the first algorithm.
220
* If long long is > 64 bits (strange architecture with VERY large long long),
221
* second algorithm can't be used, and we again use the first one.
223
* Else (if long is 32 bits and long long is 64 bits) we use second one.
226
#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64
228
/* First algorithm: generic */
231
char *put_dec(char *buf, unsigned long long n)
233
if (n >= 100*1000*1000) {
234
while (n >= 1000*1000*1000)
235
buf = put_dec_full9(buf, do_div(n, 1000*1000*1000));
236
if (n >= 100*1000*1000)
237
return put_dec_full9(buf, n);
239
return put_dec_trunc8(buf, n);
244
/* Second algorithm: valid only for 64-bit long longs */
246
static noinline_for_stack
247
char *put_dec_full4(char *buf, unsigned q)
250
r = (q * 0xcccd) >> 19;
251
*buf++ = (q - 10 * r) + '0';
252
q = (r * 0x199a) >> 16;
253
*buf++ = (r - 10 * q) + '0';
254
r = (q * 0xcd) >> 11;
255
*buf++ = (q - 10 * r) + '0';
260
/* Based on code by Douglas W. Jones found at
261
* <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour>
262
* (with permission from the author).
263
* Performs no 64-bit division and hence should be fast on 32-bit machines.
266
char *put_dec(char *buf, unsigned long long n)
268
uint32_t d3, d2, d1, q, h;
270
if (n < 100*1000*1000)
271
return put_dec_trunc8(buf, n);
273
d1 = ((uint32_t)n >> 16); /* implicit "& 0xffff" */
276
d3 = (h >> 16); /* implicit "& 0xffff" */
278
q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
280
buf = put_dec_full4(buf, q % 10000);
283
d1 = q + 7671 * d3 + 9496 * d2 + 6 * d1;
284
buf = put_dec_full4(buf, d1 % 10000);
287
d2 = q + 4749 * d3 + 42 * d2;
288
buf = put_dec_full4(buf, d2 % 10000);
294
buf = put_dec_full4(buf, d3 % 10000);
298
buf = put_dec_full4(buf, q);
300
while (buf[-1] == '0')
216
309
* Convert passed number to decimal string.