~snowball-yiddish-dev/snowball-yiddish/trunk

« back to all changes in this revision

Viewing changes to snowball/compiler/tokeniser.c

Committer: Jason Spashett
Date: 2012-04-14 13:12:57 UTC
Revision ID: jason@spashett.com-20120414131257-rv3ugy4u2iyoczdk

Add ISO 639-2, and 639-1 language codes

files added:
data

data/danish

data/danish/diffs.txt

data/danish/output.txt

data/danish/voc.txt

data/dutch

data/dutch/diffs.txt

data/dutch/output.txt

data/dutch/voc.txt

data/english

data/english/diffs.txt

data/english/output.txt

data/english/voc.txt

data/finnish

data/finnish/diffs.txt

data/finnish/output.txt

data/finnish/voc.txt

data/french

data/french/diffs.txt

data/french/output.txt

data/french/voc.txt

data/german

data/german/diffs.txt

data/german/output.txt

data/german/voc.txt

data/german2

data/german2/output.txt

data/german2/voc.txt

data/hungarian

data/hungarian/diffs.txt

data/hungarian/output.txt

data/hungarian/voc.txt

data/italian

data/italian/diffs.txt

data/italian/output.txt

data/italian/voc.txt

data/kraaij_pohlmann

data/kraaij_pohlmann/diffs.txt

data/kraaij_pohlmann/output.txt

data/kraaij_pohlmann/voc.txt

data/lovins

data/lovins/output.txt

data/lovins/voc.txt

data/norwegian

data/norwegian/diffs.txt

data/norwegian/output.txt

data/norwegian/voc.txt

data/porter

data/porter/diffs.txt

data/porter/output.txt

data/porter/voc.txt

data/portuguese

data/portuguese/diffs.txt

data/portuguese/output.txt

data/portuguese/voc.txt

data/romanian

data/romanian/diffs.txt

data/romanian/output.txt

data/romanian/voc.txt

data/russian

data/russian/diffs-t.txt

data/russian/diffs.txt

data/russian/output.txt

data/russian/voc.txt

data/spanish

data/spanish/diffs.txt

data/spanish/output.txt

data/spanish/voc.txt

data/swedish

data/swedish/diffs.txt

data/swedish/output.txt

data/swedish/voc.txt

data/turkish

data/turkish/output.txt

data/turkish/voc.txt

pystemmer

pystemmer/ChangeLog

pystemmer/HACKING

pystemmer/LICENSE

pystemmer/MANIFEST.in

pystemmer/README

pystemmer/benchmark.py

pystemmer/docs

pystemmer/docs/quickstart.txt

pystemmer/docs/quickstart_python3.txt

pystemmer/makedist.sh

pystemmer/runtests.py

pystemmer/sampledata

pystemmer/sampledata/englishvoc.txt

pystemmer/sampledata/puttydoc.txt

pystemmer/setup.py

pystemmer/src

pystemmer/src/Stemmer.pyx

scripts

scripts/bootstrap.sh

scripts/checkdata.sh

scripts/make_website.sh

snowball/AUTHORS

snowball/algorithms/danish/stem_ISO_8859_1.sbl

snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl

snowball/algorithms/dutch/stem_ISO_8859_1.sbl

snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl

snowball/algorithms/english/stem_ISO_8859_1.sbl

snowball/algorithms/finnish/stem_ISO_8859_1.sbl

snowball/algorithms/french/stem_ISO_8859_1.sbl

snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl

snowball/algorithms/german/stem_ISO_8859_1.sbl

snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl

snowball/algorithms/german2/stem_ISO_8859_1.sbl

snowball/algorithms/hungarian

snowball/algorithms/hungarian/stem_ISO_8859_1.sbl

snowball/algorithms/italian/stem_ISO_8859_1.sbl

snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl

snowball/algorithms/kraaij_pohlmann

snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl

snowball/algorithms/lovins/stem_ISO_8859_1.sbl

snowball/algorithms/norwegian/stem_ISO_8859_1.sbl

snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl

snowball/algorithms/porter/stem_ISO_8859_1.sbl

snowball/algorithms/portuguese/stem_ISO_8859_1.sbl

snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl

snowball/algorithms/romanian

snowball/algorithms/romanian/stem_ISO_8859_2.sbl

snowball/algorithms/romanian/stem_Unicode.sbl

snowball/algorithms/russian/stem_KOI8_R.sbl

snowball/algorithms/russian/stem_Unicode.sbl

snowball/algorithms/spanish/stem_ISO_8859_1.sbl

snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl

snowball/algorithms/swedish/stem_ISO_8859_1.sbl

snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl

snowball/algorithms/turkish

snowball/algorithms/turkish/stem_Unicode.sbl

snowball/algorithms/yiddish

snowball/algorithms/yiddish/stem_Unicode.sbl

snowball/compiler/syswords.h

snowball/compiler/syswords2.h

snowball/doc/libstemmer_c_README

snowball/doc/libstemmer_java_README

snowball/java

snowball/java/org

snowball/java/org/tartarus

snowball/java/org/tartarus/snowball

snowball/java/org/tartarus/snowball/Among.java

snowball/java/org/tartarus/snowball/SnowballProgram.java

snowball/java/org/tartarus/snowball/SnowballStemmer.java

snowball/java/org/tartarus/snowball/TestApp.java

snowball/libstemmer/libstemmer_c.in

snowball/libstemmer/modules.txt

snowball/libstemmer/modules_utf8.txt

website/S

website/S/index.php

website/algorithms

website/algorithms/armenian

website/algorithms/armenian/stemmer.html

website/algorithms/armenian/stemmer.java

website/algorithms/armenian/stemmer.sbl

website/algorithms/armenian/tarball.tgz

website/algorithms/basque

website/algorithms/basque/stemmer.html

website/algorithms/basque/tarball.tgz

website/algorithms/catalan

website/algorithms/catalan/stemmer.html

website/algorithms/catalan/tarball.tgz

website/algorithms/danish

website/algorithms/danish/stemmer.html

website/algorithms/danish/stop.txt

website/algorithms/dutch

website/algorithms/dutch/stemmer.html

website/algorithms/dutch/stop.txt

website/algorithms/english

website/algorithms/english/stemmer.html

website/algorithms/english/stop.txt

website/algorithms/finnish

website/algorithms/finnish/stemmer.html

website/algorithms/finnish/stop.txt

website/algorithms/french

website/algorithms/french/stemmer.html

website/algorithms/french/stop.txt

website/algorithms/german

website/algorithms/german/stemmer.html

website/algorithms/german/stop.txt

website/algorithms/german2

website/algorithms/german2/stemmer.html

website/algorithms/hungarian

website/algorithms/hungarian/stemmer.html

website/algorithms/hungarian/stop.txt

website/algorithms/italian

website/algorithms/italian/stemmer.html

website/algorithms/italian/stop.txt

website/algorithms/kraaij_pohlmann

website/algorithms/kraaij_pohlmann/stemmer.html

website/algorithms/lovins

website/algorithms/lovins/festschrift.html

website/algorithms/lovins/porter-1.jpg

website/algorithms/lovins/stemmer.html

website/algorithms/norwegian

website/algorithms/norwegian/stemmer.html

website/algorithms/norwegian/stop.txt

website/algorithms/porter

website/algorithms/porter/stemmer.html

website/algorithms/portuguese

website/algorithms/portuguese/stemmer.html

website/algorithms/portuguese/stop.txt

website/algorithms/romanian

website/algorithms/romanian/stemmer.html

website/algorithms/russian

website/algorithms/russian/stemmer.html

website/algorithms/russian/stop.txt

website/algorithms/spanish

website/algorithms/spanish/stemmer.html

website/algorithms/spanish/stop.txt

website/algorithms/swedish

website/algorithms/swedish/stemmer.html

website/algorithms/swedish/stop.txt

website/algorithms/turkish

website/algorithms/turkish/accompanying_paper.doc

website/algorithms/turkish/stemmer.html

website/compiler

website/compiler/snowman.html

website/contrib

website/contrib/PySnowballStemmer-0.0.1.tar.gz

website/otherapps

website/otherapps/pascal

website/otherapps/pascal/intro.html

website/otherapps/pascal/stemming.zip

website/otherapps/romanian

website/otherapps/romanian/intro.html

website/otherapps/romanian/romanian1.tgz

website/otherapps/romanian/romanian2.tgz

website/otherapps/schinke

website/otherapps/schinke/intro.html

website/otherapps/schinke/schinke.tgz

website/otherlangs

website/otherlangs/english_c.txt

website/otherlangs/english_cpp.txt

website/otherlangs/english_erl.txt

website/otherlangs/french_javascript.txt

website/otherlangs/german_javascript.txt

website/otherlangs/german_py.txt

website/otherlangs/index.html

website/otherlangs/italian_csharp.txt

website/otherlangs/portuguese_java.txt

website/otherlangs/russian_php5.txt

website/otherlangs/urim_c.txt

website/otherlangs/urim_javascript.txt

website/robots.txt

website/runtime

website/runtime/use.html

website/snub-dodecahedron.gif

website/texts/apostrophe.html

website/texts/earlyenglish.html

website/wrappers/PyStemmer-1.0.1.tar.gz

website/wrappers/PyStemmer-1.0.tar.gz

website/wrappers/PyStemmer-1.1.0.tar.gz

website/wrappers/PyStemmer-1.2.0.tar.gz

website/wrappers/perl.tgz

files removed:
snowball/.cvsignore

snowball/algorithms/danish/stem.sbl

snowball/algorithms/dutch/stem.sbl

snowball/algorithms/english/stem.sbl

snowball/algorithms/finnish/stem.sbl

snowball/algorithms/french/stem.sbl

snowball/algorithms/german/stem.sbl

snowball/algorithms/german2/stem.sbl

snowball/algorithms/italian/stem.sbl

snowball/algorithms/lovins/stem.sbl

snowball/algorithms/norwegian/stem.sbl

snowball/algorithms/porter/stem.sbl

snowball/algorithms/portuguese/stem.sbl

snowball/algorithms/russian/stem.sbl

snowball/algorithms/spanish/stem.sbl

snowball/algorithms/swedish/stem.sbl

snowball/compiler/sort.c

snowball/compiler/syswords

snowball/compiler/syswords2

snowball/libstemmer/.cvsignore

snowball/libstemmer/libstemmer.c

website/.cvsignore

website/Makefile

website/danish

website/danish/.cvsignore

website/danish/diffs.txt

website/danish/output.txt

website/danish/stem-MS-DOS-Latin-I.sbl

website/danish/stem.sbl

website/danish/stemmer.html

website/danish/stop.txt

website/danish/voc.txt

website/dutch

website/dutch/.cvsignore

website/dutch/diffs.txt

website/dutch/output.txt

website/dutch/stem-MS-DOS-Latin-I.sbl

website/dutch/stem.sbl

website/dutch/stemmer.html

website/dutch/stop.txt

website/dutch/voc.txt

website/english

website/english/.cvsignore

website/english/diffs.txt

website/english/output.txt

website/english/stem.sbl

website/english/stemmer.html

website/english/stop.txt

website/english/voc.txt

website/finnish

website/finnish/diffs.txt

website/finnish/output.txt

website/finnish/stem.sbl

website/finnish/stemmer.html

website/finnish/voc.txt

website/french

website/french/.cvsignore

website/french/diffs.txt

website/french/output.txt

website/french/stem-MS-DOS-Latin-I.sbl

website/french/stem.sbl

website/french/stemmer.html

website/french/stop.txt

website/french/voc.txt

website/german

website/german/.cvsignore

website/german/diffs.txt

website/german/output.txt

website/german/stem-MS-DOS-Latin-I.sbl

website/german/stem.sbl

website/german/stemmer.html

website/german/stop.txt

website/german/voc.txt

website/german2

website/german2/stem.c

website/german2/stem.h

website/german2/stem.sbl

website/german2/stemmer.html

website/italian

website/italian/.cvsignore

website/italian/diffs.txt

website/italian/output.txt

website/italian/stem-MS-DOS-Latin-I.sbl

website/italian/stem.sbl

website/italian/stemmer.html

website/italian/stop.txt

website/italian/voc.txt

website/kp

website/kp/D.txt

website/kp/stem.c

website/kp/stem.h

website/kp/stem.sbl

website/kp/stemmer.html

website/libstemmer

website/libstemmer/.cvsignore

website/libstemmer/libstemmer.h

website/libstemmer/wrapper.c

website/lovins

website/lovins/stem.c

website/lovins/stem.h

website/lovins/stem.sbl

website/lovins/stemmer.html

website/net

website/net/sf

website/net/sf/snowball

website/net/sf/snowball/Among.java

website/net/sf/snowball/SnowballProgram.java

website/net/sf/snowball/TestApp.java

website/norwegian

website/norwegian/.cvsignore

website/norwegian/diffs.txt

website/norwegian/output.txt

website/norwegian/stem-MS-DOS-Latin-I.sbl

website/norwegian/stem.sbl

website/norwegian/stemmer.html

website/norwegian/stop.txt

website/norwegian/voc.txt

website/p

website/p/analyser.c

website/p/driver.c

website/p/generator.c

website/p/generator_java.c

website/p/header.h

website/p/make

website/p/snowman.html

website/p/sort.c

website/p/space.c

website/p/syswords

website/p/syswords2

website/p/tokeniser.c

website/porter

website/porter/.cvsignore

website/porter/diffs.txt

website/porter/output.txt

website/porter/stem.sbl

website/porter/stemmer.html

website/porter/voc.txt

website/portuguese

website/portuguese/.cvsignore

website/portuguese/diffs.txt

website/portuguese/output.txt

website/portuguese/stem-MS-DOS-Latin-I.sbl

website/portuguese/stem.sbl

website/portuguese/stemmer.html

website/portuguese/stop.txt

website/portuguese/voc.txt

website/q

website/q/api.c

website/q/api.h

website/q/driver-porter.c

website/q/driver.c

website/q/driver.template

website/q/header.h

website/q/make

website/q/use.html

website/q/utilities.c

website/russian

website/russian/.cvsignore

website/russian/diffs.txt

website/russian/output.txt

website/russian/stem.sbl

website/russian/stemmer.html

website/russian/stop.txt

website/russian/voc.txt

website/snub-dodecahedron.gif

website/spanish

website/spanish/.cvsignore

website/spanish/diffs.txt

website/spanish/output.txt

website/spanish/stem-MS-DOS-Latin-I.sbl

website/spanish/stem.sbl

website/spanish/stemmer.html

website/spanish/stop.txt

website/spanish/voc.txt

website/swedish

website/swedish/.cvsignore

website/swedish/diffs.txt

website/swedish/output.txt

website/swedish/stem-MS-DOS-Latin-I.sbl

website/swedish/stem.sbl

website/swedish/stemmer.html

website/swedish/stop.txt

website/swedish/voc.txt

website/texts/snowball.tgz

website/wrappers/perl.tgz

files modified:
MODULE

snowball/GNUmakefile

snowball/README

snowball/compiler/analyser.c

snowball/compiler/driver.c

snowball/compiler/generator.c

snowball/compiler/generator_java.c

snowball/compiler/header.h

snowball/compiler/space.c

snowball/compiler/tokeniser.c

snowball/doc/TODO

snowball/examples/stemwords.c

snowball/include/libstemmer.h

snowball/libstemmer/mkmodules.pl

snowball/runtime/api.c

snowball/runtime/api.h

snowball/runtime/header.h

snowball/runtime/utilities.c

website/buglist.txt

website/codesets/guide.html

website/credits.php

website/demo.php

website/download.php

website/index.php

website/index_body.html

website/lists.php

website/menu.inc

website/projects.php

website/texts/germanic.html

website/texts/howtohelp.html

website/texts/introduction.html

website/texts/quickintro.html

website/texts/r1r2.html

website/texts/romance.html

website/texts/scandinavian.html

website/texts/stemmersoverview.html

website/wrappers/guide.html

Show diffs side-by-side

added added

removed removed

snowball/compiler/tokeniser.c

#include <ctype.h> /* isalpha etc */

#include "header.h"

struct system_word

{ int s_size; /* size of system word */

struct system_word {

int s_size; /* size of system word */

byte * s; /* pointer to the system word */

int code; /* it's internal code */

};

/* ASCII collating assumed in syswords.c */

#include "syswords"

#include "syswords.h"

static int smaller(int a, int b) { return a < b ? a : b; }

extern symbol * get_input(symbol * p)

{

extern symbol * get_input(symbol * p, char ** p_file) {

char * s = b_to_s(p);

{ FILE * input = fopen(s, "r");

free(s);

if (input == 0) return 0;

{ symbol * u = create_b(STARTSIZE);

{

FILE * input = fopen(s, "r");

if (input == 0) { free(s); return 0; }

*p_file = s;

{

symbol * u = create_b(STARTSIZE);

int size = 0;

repeat

{ int ch = getc(input);

}

static void error(struct tokeniser * t, char * s1, int n, symbol * p, char * s2)

{

static void error(struct tokeniser * t, char * s1, int n, symbol * p, char * s2) {

if (t->error_count == 20) { fprintf(stderr, "... etc\n"); exit(1); }

fprintf(stderr, "Line %d", t->line_number);

if (t->get_depth > 0) fprintf(stderr, " (of included file)");

fprintf(stderr, ": ");

fprintf(stderr, "%s:%d: ", t->file, t->line_number);

unless (s1 == 0) fprintf(stderr, "%s", s1);

unless (p == 0)

{ int i;

unless (p == 0) {

int i;

for (i = 0; i < n; i++) fprintf(stderr, "%c", p[i]);

}

unless (s2 == 0) fprintf(stderr, "%s", s2);

t->error_count++;

}

static void error1(struct tokeniser * t, char * s)

{ error(t, s, 0,0, 0); }

static void error2(struct tokeniser * t, char * s)

{ error(t, "unexpected end of text after ", 0,0, s); }

static int compare_words(int m, symbol * p, int n, byte * q)

{ unless (m == n) return m - n;

{ int i; for (i = 0; i < n; i++)

{ int diff = p[i] - q[i];

static void error1(struct tokeniser * t, char * s) {

error(t, s, 0,0, 0);

}

static void error2(struct tokeniser * t, char * s) {

error(t, "unexpected end of text after ", 0,0, s);

}

static int compare_words(int m, symbol * p, int n, byte * q) {

unless (m == n) return m - n;

{

int i; for (i = 0; i < n; i++) {

int diff = p[i] - q[i];

unless (diff == 0) return diff;

}

return 0;

}

static int find_word(int n, symbol * p)

{ int i = 0; int j = vocab->code;

repeat

{ int k = i + (j - i)/2;

static int find_word(int n, symbol * p) {

int i = 0; int j = vocab->code;

repeat {

int k = i + (j - i)/2;

struct system_word * w = vocab + k;

int diff = compare_words(n, p, w->s_size, w->s);

if (diff == 0) return w->code;

return -1;

}

static int get_number(int n, symbol * p)

{ int x = 0;

static int get_number(int n, symbol * p) {

int x = 0;

int i; for (i = 0; i < n; i++) x = 10*x + p[i] - '0';

return x;

}

static int eq_s(struct tokeniser * t, char * s)

{ int l = strlen(s);

static int eq_s(struct tokeniser * t, char * s) {

int l = strlen(s);

if (SIZE(t->p) - t->c < l) return false;

{ int i;

{

int i;

for (i = 0; i < l; i++) if (t->p[t->c + i] != s[i]) return false;

100

}

101

t->c += l; return true;

102

}

100

103

101

static int white_space(struct tokeniser * t, int ch)

102

{ switch (ch)

103

{ case '\n': t->line_number++;

104

static int white_space(struct tokeniser * t, int ch) {

105

switch (ch) {

106

case '\n': t->line_number++;

104

107

case '\r':

105

108

case '\t':

106

109

case ' ': return true;

108

111

return false;

109

112

}

110

113

111

static symbol * find_in_m(struct tokeniser * t, int n, symbol * p)

112

{ struct m_pair * q = t->m_pairs;

113

repeat

114

{ if (q == 0) return 0;

115

{ symbol * name = q->name;

114

static symbol * find_in_m(struct tokeniser * t, int n, symbol * p) {

115

struct m_pair * q = t->m_pairs;

116

repeat {

117

if (q == 0) return 0;

118

{

119

symbol * name = q->name;

116

120

if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) return q->value;

117

121

}

118

122

q = q->next;

119

123

}

120

124

}

121

125

122

static int read_literal_string(struct tokeniser * t, int c)

123

{ symbol * p = t->p;

126

static int read_literal_string(struct tokeniser * t, int c) {

127

symbol * p = t->p;

124

128

int ch;

125

129

SIZE(t->b) = 0;

126

repeat

127

{ if (c >= SIZE(p)) { error2(t, "'"); return c; }

130

repeat {

131

if (c >= SIZE(p)) { error2(t, "'"); return c; }

128

132

ch = p[c];

129

133

if (ch == '\n') { error1(t, "string not terminated"); return c; }

130

134

c++;

131

if (ch == t->m_start)

132

{ int c0 = c;

135

if (ch == t->m_start) {

136

int c0 = c;

133

137

int newlines = false; /* no newlines as yet */

134

138

int black_found = false; /* no printing chars as yet */

135

repeat

136

{ if (c >= SIZE(p)) { error2(t, "'"); return c; }

139

repeat {

140

if (c >= SIZE(p)) { error2(t, "'"); return c; }

137

141

ch = p[c]; c++;

138

142

if (ch == t->m_end) break;

139

143

unless (white_space(t, ch)) black_found = true;

140

144

if (ch == '\n') newlines = true;

141

{ if (newlines && black_found)

142

{ error1(t, "string not terminated");

143

return c;

144

}

145

if (newlines && black_found) {

146

error1(t, "string not terminated");

147

return c;

145

148

}

146

149

}

147

unless (newlines)

148

{

150

unless (newlines) {

149

151

int n = c - c0 - 1; /* macro size */

150

152

int firstch = p[c0];

151

153

symbol * q = find_in_m(t, n, p + c0);

152

if (q == 0)

153

{ if (n == 1 && (firstch == '\'' || firstch == t->m_start))

154

if (q == 0) {

155

if (n == 1 && (firstch == '\'' || firstch == t->m_start))

154

156

t->b = add_to_b(t->b, 1, p + c0);

155

157

else

156

158

error(t, "string macro '", n, p + c0, "' undeclared");

157

}

158

else

159

} else

159

160

t->b = add_to_b(t->b, SIZE(q), q);

160

161

}

161

} else

162

{ if (ch == '\'') return c;

162

} else {

163

if (ch == '\'') return c;

163

164

t->b = add_to_b(t->b, 1, p + c - 1);

164

165

}

165

166

}

166

167

}

167

168

static int next_token(struct tokeniser * t)

169

{ symbol * p = t->p;

169

static int next_token(struct tokeniser * t) {

170

symbol * p = t->p;

170

171

int c = t->c;

171

172

int ch;

172

173

int code = -1;

173

repeat

174

{ if (c >= SIZE(p)) { t->c = c; return -1; }

174

repeat {

175

if (c >= SIZE(p)) { t->c = c; return -1; }

175

176

ch = p[c];

176

177

if (white_space(t, ch)) { c++; continue; }

177

if (isalpha(ch))

178

{ int c0 = c;

178

if (isalpha(ch)) {

179

int c0 = c;

179

180

while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++;

180

181

code = find_word(c - c0, p + c0);

181

if (code < 0)

182

{ t->b = move_to_b(t->b, c - c0, p + c0);

182

if (code < 0) {

183

t->b = move_to_b(t->b, c - c0, p + c0);

183

184

code = c_name;

184

185

}

185

186

} else

186

if (isdigit(ch))

187

{ int c0 = c;

187

if (isdigit(ch)) {

188

int c0 = c;

188

189

while (c < SIZE(p) && isdigit(p[c])) c++;

189

190

t->number = get_number(c - c0, p + c0);

190

191

code = c_number;

191

192

} else

192

if (ch == '\'')

193

{ c = read_literal_string(t, c + 1);

193

if (ch == '\'') {

194

c = read_literal_string(t, c + 1);

194

195

code = c_literalstring;

195

196

} else

196

{ int lim = smaller(2, SIZE(p) - c);

197

int i; for (i = lim; i > 0; i--)

198

{ code = find_word(i, p + c);

197

{

198

int lim = smaller(2, SIZE(p) - c);

199

int i;

200

for (i = lim; i > 0; i--) {

201

code = find_word(i, p + c);

199

202

if (code >= 0) { c += i; break; }

200

203

}

201

204

}

202

if (code >= 0)

203

{ t->c = c;

205

if (code >= 0) {

206

t->c = c;

204

207

return code;

205

208

}

206

209

error(t, "'", 1, p + c, "' unknown");

209

212

}

210

213

}

211

214

212

static int next_char(struct tokeniser * t)

213

{ if (t->c >= SIZE(t->p)) return -1;

215

static int next_char(struct tokeniser * t) {

216

if (t->c >= SIZE(t->p)) return -1;

214

217

return t->p[t->c++];

215

218

}

216

219

217

static int next_real_char(struct tokeniser * t)

218

{ repeat

219

{ int ch = next_char(t);

220

static int next_real_char(struct tokeniser * t) {

221

repeat {

222

int ch = next_char(t);

220

223

if (white_space(t, ch)) continue;

221

224

return ch;

222

225

}

223

226

}

224

227

225

static void read_chars(struct tokeniser * t)

226

{ int ch = next_real_char(t);

227

if (ch < 0)

228

{ error2(t, "stringdef"); return; }

229

{ int c0 = t->c-1;

230

repeat

231

{ ch = next_char(t);

228

static void read_chars(struct tokeniser * t) {

229

int ch = next_real_char(t);

230

if (ch < 0) { error2(t, "stringdef"); return; }

231

{

232

int c0 = t->c-1;

233

repeat {

234

ch = next_char(t);

232

235

if (white_space(t, ch) || ch < 0) break;

233

236

}

234

237

t->b2 = move_to_b(t->b2, t->c - c0 - 1, t->p + c0);

235

238

}

236

239

}

237

240

238

static int decimal_to_num(int ch)

239

{ if ('0' <= ch && ch <= '9') return ch - '0';

241

static int decimal_to_num(int ch) {

242

if ('0' <= ch && ch <= '9') return ch - '0';

240

243

return -1;

241

244

}

242

245

243

static int hex_to_num(int ch)

244

{ if ('0' <= ch && ch <= '9') return ch - '0';

246

static int hex_to_num(int ch) {

247

if ('0' <= ch && ch <= '9') return ch - '0';

245

248

if ('a' <= ch && ch <= 'f') return ch - 'a' + 10;

246

249

return -1;

247

250

}

248

251

249

static void convert_numeric_string(struct tokeniser * t, symbol * p, int base)

250

{

252

static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) {

251

253

int c = 0; int d = 0;

252

repeat

253

{ while (c < SIZE(p) && p[c] == ' ') c++;

254

repeat {

255

while (c < SIZE(p) && p[c] == ' ') c++;

254

256

if (c == SIZE(p)) break;

255

{ int number = 0;

256

repeat

257

{ int ch = p[c];

257

{

258

int number = 0;

259

repeat {

260

int ch = p[c];

258

261

if (c == SIZE(p) || ch == ' ') break;

259

if (base == 10)

260

{ ch = decimal_to_num(ch);

261

if (ch < 0)

262

{ error1(t, "decimal string contains non-digits");

262

if (base == 10) {

263

ch = decimal_to_num(ch);

264

if (ch < 0) {

265

error1(t, "decimal string contains non-digits");

263

266

return;

264

267

}

265

} else

266

{ ch = hex_to_num(tolower(ch));

267

if (ch < 0)

268

{ error1(t, "hex string contains non-hex characters");

268

} else {

269

ch = hex_to_num(tolower(ch));

270

if (ch < 0) {

271

error1(t, "hex string contains non-hex characters");

269

272

return;

270

273

}

271

274

}

272

275

number = base * number + ch;

273

276

c++;

274

277

}

275

if (t->widechars) unless (0 <= number && number <= 0xffff)

276

{ error1(t, "character values exceed 64K");

277

return;

278

}

279

unless (t->widechars) unless (0 <= number && number <= 0xff)

280

{ error1(t, "character values exceed 256");

281

return;

282

}

283

p[d++] = number;

278

if (t->widechars || t->utf8) {

279

unless (0 <= number && number <= 0xffff) {

280

error1(t, "character values exceed 64K");

281

return;

282

}

283

} else {

284

unless (0 <= number && number <= 0xff) {

285

error1(t, "character values exceed 256");

286

return;

287

}

288

}

289

if (t->utf8)

290

d += put_utf8(number, p + d);

291

else

292

p[d++] = number;

284

293

}

285

294

}

286

295

SIZE(p) = d;

287

296

}

288

297

289

extern int read_token(struct tokeniser * t)

290

{ symbol * p = t->p;

298

extern int read_token(struct tokeniser * t) {

299

symbol * p = t->p;

291

300

int held = t->token_held;

292

301

t->token_held = false;

293

302

if (held) return t->token;

294

repeat

295

{ int code = next_token(t);

296

switch (code)

297

{ case c_comment1: /* slash-slash comment */

303

repeat {

304

int code = next_token(t);

305

switch (code) {

306

case c_comment1: /* slash-slash comment */

298

307

while (t->c < SIZE(p) && p[t->c] != '\n') t->c++;

299

308

continue;

300

309

case c_comment2: /* slash-star comment */

301

repeat

302

{ if (t->c >= SIZE(p))

303

{ error1(t, "/* comment not terminated");

310

repeat {

311

if (t->c >= SIZE(p)) {

312

error1(t, "/* comment not terminated");

304

313

t->token = -1;

305

314

return -1;

306

315

}

310

319

}

311

320

continue;

312

321

case c_stringescapes:

313

{ int ch1 = next_real_char(t);

322

{

323

int ch1 = next_real_char(t);

314

324

int ch2 = next_real_char(t);

315

325

if (ch2 < 0)

316

326

{ error2(t, "stringescapes"); continue; }

321

331

}

322

332

continue;

323

333

case c_stringdef:

324

{ int base = 0;

334

{

335

int base = 0;

325

336

read_chars(t);

326

337

code = read_token(t);

327

338

if (code == c_hex) { base = 16; code = read_token(t); } else

338

349

}

339

350

continue;

340

351

case c_get:

341

{ code = read_token(t);

342

unless (code == c_literalstring)

343

{ error1(t, "string omitted after get"); continue; }

344

t->get_depth++;

345

if (t->get_depth > 10)

346

{ fprintf(stderr, "get directives go 10 deep. Looping?\n");

352

code = read_token(t);

353

unless (code == c_literalstring) {

354

error1(t, "string omitted after get"); continue;

355

}

356

t->get_depth++;

357

if (t->get_depth > 10) {

358

fprintf(stderr, "get directives go 10 deep. Looping?\n");

359

exit(1);

360

}

361

{

362

char * file;

363

NEW(input, q);

364

symbol * u = get_input(t->b, &file);

365

if (u == 0) {

366

struct include * r = t->includes;

367

until (r == 0) {

368

symbol * b = copy_b(r->b);

369

b = add_to_b(b, SIZE(t->b), t->b);

370

u = get_input(b, &file);

371

lose_b(b);

372

unless (u == 0) break;

373

r = r->next;

374

}

375

}

376

if (u == 0) {

377

error(t, "Can't get '", SIZE(t->b), t->b, "'");

347

378

exit(1);

348

379

}

349

{ NEW(input, q);

350

symbol * u = get_input(t->b);

351

if (u == 0)

352

{ struct include * r = t->includes;

353

until (r == 0)

354

{ symbol * b = copy_b(r->b);

355

b = add_to_b(b, SIZE(t->b), t->b);

356

u = get_input(b);

357

lose_b(b);

358

unless (u == 0) break;

359

r = r->next;

360

}

361

}

362

if (u == 0)

363

{ error(t, "Can't get '", SIZE(t->b), t->b, "'");

364

exit(1);

365

}

366

memmove(q, t, sizeof(struct input));

367

t->next = q;

368

t->p = u;

369

t->c = 0;

370

t->line_number = 1;

371

}

372

p = t->p;

373

continue;

380

memmove(q, t, sizeof(struct input));

381

t->next = q;

382

t->p = u;

383

t->c = 0;

384

t->file = file;

385

t->line_number = 1;

374

386

}

387

p = t->p;

388

continue;

375

389

case -1:

376

unless (t->next == 0)

377

{ lose_b(p);

378

{ struct input * q = t->next;

390

unless (t->next == 0) {

391

lose_b(p);

392

{

393

struct input * q = t->next;

379

394

memmove(t, q, sizeof(struct input)); p = t->p;

380

395

FREE(q);

381

396

}

391

406

}

392

407

}

393

408

394

extern byte * name_of_token(int code)

395

{ int i;

409

extern byte * name_of_token(int code) {

410

int i;

396

411

for (i = 1; i < vocab->code; i++)

397

412

if ((vocab + i)->code == code) return (vocab + i)->s;

398

switch (code)

399

{

413

switch (code) {

400

414

case c_mathassign: return (byte *) "=";

401

415

case c_name: return (byte *) "name";

402

416

case c_number: return (byte *) "number";

411

425

}

412

426

}

413

427

414

extern struct tokeniser * create_tokeniser(symbol * p)

415

{ NEW(tokeniser, t);

428

extern struct tokeniser * create_tokeniser(symbol * p, char * file) {

429

NEW(tokeniser, t);

416

430

t->next = 0;

417

431

t->p = p;

418

432

t->c = 0;

433

t->file = file;

419

434

t->line_number = 1;

420

435

t->b = create_b(0);

421

436

t->b2 = create_b(0);

429

444

return t;

430

445

}

431

446

432

extern void close_tokeniser(struct tokeniser * t)

433

{ lose_b(t->b);

447

extern void close_tokeniser(struct tokeniser * t) {

448

lose_b(t->b);

434

449

lose_b(t->b2);

435

{ struct m_pair * q = t->m_pairs;

436

until (q == 0)

437

{ struct m_pair * q_next = q->next;

450

{

451

struct m_pair * q = t->m_pairs;

452

until (q == 0) {

453

struct m_pair * q_next = q->next;

438

454

lose_b(q->name);

439

455

lose_b(q->value);

440

456

FREE(q);

441

457

q = q_next;

442

458

}

443

459

}

444

{ struct input * q = t->next;

445

until (q == 0)

446

{ struct input * q_next = q->next;

460

{

461

struct input * q = t->next;

462

until (q == 0) {

463

struct input * q_next = q->next;

447

464

FREE(q);

448

465

q = q_next;

449

466

}

450

467

}

468

free(t->file);

451

469

FREE(t);

452

470

}

453

Older »