2
// A utility for finding substring embeddings in patterns
8
#define MAXPATHS (256*1024)
15
fprintf(stderr,"%s\n",msg);
20
// Finds the index of an entry, only used on xxx_key arrays
21
// Caveat: the table has to be sorted
27
int left=0, right=max-1;
28
while (left <= right) {
29
int mid = ((right-left)/2)+left;
30
int v = strcmp(pat,tab[mid]);
43
// used by partition (which is used by qsort_arr)
53
v=a[i]; a[i]=a[j]; a[j]=v;
54
v=b[i]; b[i]=b[j]; b[j]=v;
67
const char *pivotValue = a[p];
69
swap2(a,b,p,right); // Move pivot to end
71
for (i=left; i<right; i++) {
72
if (strcmp(a[i],pivotValue)<=0) {
77
swap2(a,b,right,p); // Move pivot to its final place
84
static void qsort_arr(
90
while (right > left) {
91
int p = left + (right-left)/2; //select a pivot
92
p = partition(a,b, left, right, p);
93
if ((p-1) - left < right - (p+1)) {
94
qsort_arr(a,b, left, p-1);
97
qsort_arr(a,b, p+1, right);
104
// Removes extra '0' entries from the string
106
static char* compact(
111
for (i=0,j=0; i<l; i++) {
112
if (expr[i]!='0') expr[j++] = expr[i];
119
// convert 'n1im' to 0n1i0m0 expressed as a string
129
for (i=0; i<l; i++) {
131
if ( (last<'0' || last>'9')
132
&& (c <'0' || c >'9')
139
if (last<'0' || last>'9') expr[el++] = '0';
144
// Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der
145
// The second pattern needs to be a right side match of the first
147
static char *combine(
151
int l1 = strlen(expr);
152
int l2 = strlen(subexpr);
155
// this works also for utf8 sequences because the substring is identical
156
// to the last substring-length bytes of expr except for the (single byte)
157
// hyphenation encoders
158
for (j=0; j<l2; j++) {
159
if (subexpr[j]>expr[off+j]) {
160
expr[off+j] = subexpr[j];
169
int main(int argc, const char* argv[]) {
171
char *pattab_key[MAXPATHS];
172
char *pattab_val[MAXPATHS];
174
char *newpattab_key[MAXPATHS];
175
char *newpattab_val[MAXPATHS];
177
char format[132]; // 64+65+newline+zero+spare
179
if (argc!=3) die("Usage: <orig-file> <new-file>\n");
180
if ((in = fopen(argv[1],"r"))==NULL) die("Could not read input");
181
if ((out = fopen(argv[2],"w"))==NULL) die("Could not create output");
182
// read all patterns and split in pure text (_key) & expanded patterns (_val)
183
while(fgets(format,132,in)) {
184
int l = strlen(format);
185
if (format[l-1]=='\n') { l--; format[l]=0; } // Chomp
186
if (format[0]=='%' || format[0]==0) {
189
if (format[l-1]=='%') {
191
format[l] = 0; // remove '%'
194
char *pat = (char*) malloc(l+1);
195
char *org = (char*) malloc(l*2+1);
196
expand(org,format,l);
197
// remove hyphenation encoders (digits) from pat
198
for (i=0,j=0; i<l; i++) {
199
// odd, but utf-8 proof
201
if (c<'0' || c>'9') pat[j++]=c;
205
pattab_key[patterns] = pat;
206
pattab_val[patterns++] = org;
207
if (patterns>MAXPATHS) die("to many base patterns");
211
// As we use binairy search, make sure it is sorted
212
qsort_arr(pattab_key,pattab_val,0,patterns-1);
214
for (p=0; p<patterns; p++) {
215
char *pat = pattab_key[p];
216
int patsize = strlen(pat);
218
for (l=1; l<=patsize; l++) {
219
for (j=1; j<=l; j++) {
223
strncpy(subpat,pat+i,j); subpat[j]=0;
224
if ((subpat_ndx = find_in(pattab_key,patterns,subpat))>=0) {
226
char *newpat=malloc(l+1);
227
//printf("%s is embedded in %s\n",pattab_val[subpat_ndx],pattab_val[p]);
228
strncpy(newpat, pat+0,l); newpat[l]=0;
229
if ((newpat_ndx = find_in(newpattab_key,newpatterns,newpat))<0) {
230
char *neworg = malloc(132); // TODO: compute exact length
231
expand(neworg,newpat,l);
232
newpattab_key[newpatterns] = newpat;
233
newpattab_val[newpatterns++] = combine(neworg,pattab_val[subpat_ndx]);
234
if (newpatterns>MAXPATHS) die("to many new patterns");
235
//printf("%*.*s|%*.*s[%s] (%s|%s) = %s\n",i,i,pat,j,j,pat+i,pat+i+j,pattab_val[p],pattab_val[subpat_ndx],neworg);
238
newpattab_val[newpat_ndx] = combine(
239
newpattab_val[newpat_ndx], pattab_val[subpat_ndx] );
246
/* for some tiny extra speed, one could forget the free()s
247
* as the memory is freed anyway on exit().
248
* However, the gain is minimal and now the code can be cleanly
249
* incorporated into other code */
250
for (p=0; p<newpatterns; p++) {
251
fprintf(out,"%s\n",compact(newpattab_val[p]));
252
free(newpattab_key[p]);
253
free(newpattab_val[p]);
257
for (p=0; p<patterns; p++) {