58
58
#include "idngram_merge.h"
61
void WriteOut(FILE* out, std::map<CSIM_Idngram<N>, unsigned int> & map)
62
WriteOut(FILE* out, std::map<CSIM_Idngram<N>, unsigned int> & map)
63
typedef typename std::map<CSIM_Idngram<N>,unsigned int>::iterator TMapIterator;
64
TMapIterator its=map.begin(), ite=map.end();
65
for (; its != ite; ++its) {
66
fwrite(its->first.ids, sizeof(TSIMWordId), N, out);
67
fwrite(&(its->second), sizeof(unsigned int), 1, out);
64
typedef typename std::map<CSIM_Idngram<N>,
65
unsigned int>::iterator TMapIterator;
66
TMapIterator its = map.begin(), ite = map.end();
67
for (; its != ite; ++its) {
68
fwrite(its->first.ids, sizeof(TSIMWordId), N, out);
69
fwrite(&(its->second), sizeof(unsigned int), 1, out);
73
void ProcessingRead(FILE *fp, FILE* swap, std::vector<long>& para_offsets, size_t paraMax)
76
ProcessingRead(FILE *fp,
78
std::vector<long>& para_offsets,
75
typedef CSIM_Idngram<N> TNgram;
76
typedef typename std::map<CSIM_Idngram<N>, unsigned int> TMap;
81
TSIMWordId* ids = ngram.ids;
82
fread(ids, sizeof(TSIMWordId), N-1, fp);
83
while (fread(ids+N-1, sizeof(TSIMWordId), 1, fp) == 1) {
84
assert (map[ngram] < UINT_MAX);
86
if (map.size() >= paraMax)
88
printf("."); fflush(stdout);
90
para_offsets.push_back(ftell(swap));
92
for (int i=0; i<N-1; ++i) ids[i] = ids[i+1];
95
printf("."); fflush(stdout);
97
para_offsets.push_back(ftell(swap));
81
typedef CSIM_Idngram<N> TNgram;
82
typedef typename std::map<CSIM_Idngram<N>, unsigned int> TMap;
87
TSIMWordId* ids = ngram.ids;
88
fread(ids, sizeof(TSIMWordId), N - 1, fp);
89
while (fread(ids + N - 1, sizeof(TSIMWordId), 1, fp) == 1) {
90
assert(map[ngram] < UINT_MAX);
92
if (map.size() >= paraMax) {
93
printf("."); fflush(stdout);
95
para_offsets.push_back(ftell(swap));
97
for (int i = 0; i < N - 1; ++i) ids[i] = ids[i + 1];
100
printf("."); fflush(stdout);
102
para_offsets.push_back(ftell(swap));
101
106
static struct option long_options[] =
108
{ "NMax", 1, 0, 'n' },
109
{ "out", 1, 0, 'o' },
110
{ "swap", 1, 0, 's' },
111
{ "para", 1, 0, 'p' },
111
static int paraMax=0;
112
static char* output=NULL;
113
static char* swapfile=NULL;
116
static int paraMax = 0;
117
static char* output = NULL;
118
static char* swapfile = NULL;
117
printf("Usage:\n\tids2ngram options idsfile[ idsfile...]\n");
118
printf("\nDescription\n");
119
printf(" This program generate idngram file, which is a sorted [id1,..idN,freq] array, from binary id stream files.\n");
120
printf("\nInput:\n");
121
printf("\tBinary id stream files looks like [id0,...,idX]\n");
122
printf("\nOptions:\n");
123
printf("\t -n N # N-gram\n");
124
printf("\t -s swapfile # intermedia temporary file\n");
125
printf("\t -o outputfile # result idngram file [id1, ... idN, freq]*\n");
126
printf("\t -p para_size # maxium ngram-items per para\n");
127
printf("\nExample:\n");
128
printf(" Following example will use three input idstream file idsfile[1,2,3] to generate the idngram file all.id3gram. Each para (internal map size or hash size) would be 1024000, using swap file for temp result. All temp para result would final be merged to got the final result.\n");
129
printf("\tids2idngram -n 3 -s /tmp/swap -o all.id3gram -p 1024000 idsfile1 idsfile2 idsfile3\n\n");
123
printf("Usage:\n\tids2ngram options idsfile[ idsfile...]\n");
124
printf("\nDescription\n");
126
" This program generate idngram file, which is a sorted [id1,..idN,freq] array, from binary id stream files.\n");
127
printf("\nInput:\n");
128
printf("\tBinary id stream files looks like [id0,...,idX]\n");
129
printf("\nOptions:\n");
130
printf("\t -n N # N-gram\n");
131
printf("\t -s swapfile # intermedia temporary file\n");
133
"\t -o outputfile # result idngram file [id1, ... idN, freq]*\n");
134
printf("\t -p para_size # maxium ngram-items per para\n");
135
printf("\nExample:\n");
137
" Following example will use three input idstream file idsfile[1,2,3] to generate the idngram file all.id3gram. Each para (internal map size or hash size) would be 1024000, using swap file for temp result. All temp para result would final be merged to got the final result.\n");
139
"\tids2idngram -n 3 -s /tmp/swap -o all.id3gram -p 1024000 idsfile1 idsfile2 idsfile3\n\n");
133
static void getParameters(int argc, char* const argv[])
144
getParameters(int argc, char* const argv[])
135
int option_index = 0;
137
while ((c=getopt_long(argc, argv, "p:n:s:o:", long_options, &option_index)) != -1)
141
N = atoi(strdup(optarg));
144
paraMax = atoi(strdup(optarg));
147
output = strdup(optarg);
150
swapfile = strdup(optarg);
156
if (N < 1 || N > 3 || paraMax < 1024 || output == NULL || swapfile == NULL)
146
int option_index = 0;
149
getopt_long(argc, argv, "p:n:s:o:", long_options,
150
&option_index)) != -1) {
153
N = atoi(strdup(optarg));
156
paraMax = atoi(strdup(optarg));
159
output = strdup(optarg);
162
swapfile = strdup(optarg);
168
if (N < 1 || N > 3 || paraMax < 1024 || output == NULL || swapfile == NULL)
160
172
static std::vector<long> para_offsets;
162
int main(int argc, char* argv[])
175
main(int argc, char* argv[])
164
getParameters(argc, argv);
165
FILE *swap = fopen(swapfile, "wb+");
166
FILE *out = fopen(output, "wb+");
167
if (optind >= argc) ShowUsage();
168
while (optind < argc) {
169
printf("Processing %s:", argv[optind]); fflush(stdout);
170
FILE *fp = fopen(argv[optind], "rb");
173
ProcessingRead<1>(fp, swap, para_offsets, paraMax);
176
ProcessingRead<2>(fp, swap, para_offsets, paraMax);
179
ProcessingRead<3>(fp, swap, para_offsets, paraMax);
183
printf ("\n"); fflush(stdout);
186
printf("Merging..."); fflush(stdout);
189
ProcessingIdngramMerge<1>(swap, out, para_offsets);
192
ProcessingIdngramMerge<2>(swap, out, para_offsets);
195
ProcessingIdngramMerge<3>(swap, out, para_offsets);
198
printf ("Done\n"); fflush(stdout);
177
getParameters(argc, argv);
178
FILE *swap = fopen(swapfile, "wb+");
179
FILE *out = fopen(output, "wb+");
180
if (optind >= argc) ShowUsage();
181
while (optind < argc) {
182
printf("Processing %s:", argv[optind]); fflush(stdout);
183
FILE *fp = fopen(argv[optind], "rb");
186
ProcessingRead<1>(fp, swap, para_offsets, paraMax);
189
ProcessingRead<2>(fp, swap, para_offsets, paraMax);
192
ProcessingRead<3>(fp, swap, para_offsets, paraMax);
196
printf("\n"); fflush(stdout);
199
printf("Merging..."); fflush(stdout);
202
ProcessingIdngramMerge<1>(swap, out, para_offsets);
205
ProcessingIdngramMerge<2>(swap, out, para_offsets);
208
ProcessingIdngramMerge<3>(swap, out, para_offsets);
211
printf("Done\n"); fflush(stdout);