1
/* This is a simple program which uses libstemmer to provide a command
2
* line interface for stemming using any of the algorithms provided.
6
#include <stdlib.h> /* for malloc, free */
7
#include <string.h> /* for memmove */
8
#include <ctype.h> /* for isupper, tolower */
10
#include "libstemmer.h"
12
/* Program name with external linkage.
 * NOTE(review): presumably assigned from argv[0] in main and substituted into
 * the "usage: %s ..." help text -- the assignment is not visible here; confirm. */
const char * progname;
13
/* Output layout selector, private to this file; starts at 1.
 * The stemming code has a separate branch for the value 2.
 * NOTE(review): presumably 1 is the "word -> stem" form (-p) and 2 the
 * two-column form (-p2) described in the usage text -- confirm against the
 * option parsing in main. */
static int pretty = 1;
16
stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
20
sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol));
31
if (ch == '\n' || ch == EOF) break;
35
realloc(b, (lim + INC) * sizeof(sb_symbol));
36
if (newb == 0) goto error;
40
/* Update count of utf-8 characters. */
41
if (ch < 0x80 || ch > 0xBF) inlen += 1;
42
/* force lower case: */
43
if (isupper(ch)) ch = tolower(ch);
51
const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i);
54
fprintf(stderr, "Out of memory");
60
fwrite(b, i, 1, f_out);
62
} else if (pretty == 2) {
63
fwrite(b, i, 1, f_out);
64
if (sb_stemmer_length(stemmer) > 0) {
67
for (j = 30 - inlen; j > 0; j--)
71
for (j = 30; j > 0; j--)
77
fputs((char *)stemmed, f_out);
88
/** Display the command line syntax, and then exit.
89
* @param n The value to exit with.
94
printf("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n"
96
"The input file consists of a list of words to be stemmed, one per\n"
97
"line. Words should be in lower case, but (for English) A-Z letters\n"
98
"are mapped to their a-z equivalents anyway. If omitted, stdin is\n"
101
"If -c is given, the argument is the character encoding of the input\n"
102
"and output files. If it is omitted, the UTF-8 encoding is used.\n"
104
"If -p is given the output file consists of each word of the input\n"
105
"file followed by \"->\" followed by its stemmed equivalent.\n"
106
"If -p2 is given the output file is a two column layout containing\n"
107
"the input words in the first column and the stemmed equivalents in\n"
108
"the second column.\n"
109
"Otherwise, the output file consists of the stemmed words, one per\n"
112
"-h displays this help\n",
118
main(int argc, char * argv[])
124
struct sb_stemmer * stemmer;
126
char * language = "english";
127
char * charenc = NULL;
138
if (strcmp(s, "-o") == 0) {
140
fprintf(stderr, "%s requires an argument\n", s);
144
} else if (strcmp(s, "-i") == 0) {
146
fprintf(stderr, "%s requires an argument\n", s);
150
} else if (strcmp(s, "-l") == 0) {
152
fprintf(stderr, "%s requires an argument\n", s);
155
language = argv[i++];
156
} else if (strcmp(s, "-c") == 0) {
158
fprintf(stderr, "%s requires an argument\n", s);
162
} else if (strcmp(s, "-p2") == 0) {
164
} else if (strcmp(s, "-p") == 0) {
166
} else if (strcmp(s, "-h") == 0) {
169
fprintf(stderr, "option %s unknown\n", s);
173
fprintf(stderr, "unexpected parameter %s\n", s);
178
/* prepare the files */
179
f_in = (in == 0) ? stdin : fopen(in, "r");
181
fprintf(stderr, "file %s not found\n", in);
184
f_out = (out == 0) ? stdout : fopen(out, "w");
186
fprintf(stderr, "file %s cannot be opened\n", out);
190
/* do the stemming process: */
191
stemmer = sb_stemmer_new(language, charenc);
193
if (charenc == NULL) {
194
fprintf(stderr, "language `%s' not available for stemming\n", language);
197
fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc);
201
stem_file(stemmer, f_in, f_out);
202
sb_stemmer_delete(stemmer);
204
if (in != 0) (void) fclose(f_in);
205
if (out != 0) (void) fclose(f_out);