# ~vcs-imports/gawk/master

# Test driver: split every sample string twice -- once with the awk-level
# mypatsplit() and once with the built-in patsplit() -- print both results
# side by side, and exit with status 1 at the first disagreement.
BEGIN {
	# Boolean constants; mypatsplit() reads these as globals.
	true = 1
	false = 0

	# Sample k is data[k], split with the regexp held in fpat[k].

	# Samples 1-3: comma-separated records, some with a quoted field
	# that itself contains a comma.
	fpat[1] = "([^,]*)|(\"[^\"]+\")"
	data[1] = "Robbins,,Arnold,"
	fpat[2] = fpat[1]
	data[2] = "Smith,,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
	fpat[3] = fpat[1]
	data[3] = "Robbins,Arnold,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"

	# Samples 4-5: runs of two or more "a".  In sample 5 the final
	# lone "a" cannot match "aa+", so the text after the last match
	# is "qqqqa".
	fpat[4] = "aa+"
	data[4] = "bbbaaacccdddaaaaaqqqq"
	fpat[5] = fpat[4]
	data[5] = "bbbaaacccdddaaaaaqqqqa"

	# Sample 6: single lowercase letters.
	fpat[6] = "[a-z]"
	data[6] = "aAbBcC"

	tc = 1
	while (tc in data) {
		printf("Splitting: <%s>\n", data[tc])
		n = mypatsplit(data[tc], fields, fpat[tc], seps)
		m = patsplit(data[tc], fields2, fpat[tc], seps2)
		print "n =", n, "m =", m
		if (n != m) {
			printf("ERROR: counts wrong!\n") > "/dev/stderr"
			exit 1
		}

		# Compare the matched pieces, 1 through n.
		for (k = 1; k <= n; k++) {
			printf("fields[%d] = <%s>\tfields2[%d] = <%s>\n", k, fields[k], k, fields2[k])
			if (fields[k] == fields2[k])
				continue
			printf("ERROR: data %d, field %d mismatch!\n", tc, k) > "/dev/stderr"
			exit 1
		}

		# Compare the separators, from index 0 up to the first index
		# that is missing from seps.
		for (k = 0; k in seps; k++) {
			printf("seps[%d] = <%s>\tseps2[%d] = <%s>\n", k, seps[k], k, seps2[k])
			if (seps[k] == seps2[k])
				continue
			printf("ERROR: data %d, separator %d mismatch!\n", tc, k) > "/dev/stderr"
			exit 1
		}

		tc++
	}
}

# mypatsplit --- awk-level splitter that the BEGIN rule runs alongside the
# built-in patsplit() so the two results can be compared.
#
# Parameters:
#	string	text to split; only the function's own copy is consumed
#	array	cleared, then filled with the matched pieces: array[1..nf]
#	pattern	regexp (as a string) handed to match()
#	seps	cleared, then filled with the text between matches;
#		seps[0] is whatever precedes the first match
#
# Returns the number of elements stored in array (0 for an empty string).
#
# Reads the global variables true and false; they must be set before
# the first call.
function mypatsplit(string, array, pattern, seps,
			eosflag, non_empty, nf) # locals
{
	# Start from empty result arrays on every call.
	delete array
	delete seps
	if (length(string) == 0)
		return 0

	# non_empty: the previous accepted match had RLENGTH > 0.
	# eosflag: string has already been seen empty once (see loop bottom).
	eosflag = non_empty = false
	nf = 0
	while (match(string, pattern)) {
		if (RLENGTH > 0) {	# easy case: a non-empty match
			non_empty = true
			# Store the text before the match as seps[nf], unless an
			# earlier iteration already put a separator in that slot.
			if (! (nf in seps)) {
				if (RSTART == 1)	# match at front of string
					seps[nf] = ""
				else
					seps[nf] = substr(string, 1, RSTART - 1)
			}
			array[++nf] = substr(string, RSTART, RLENGTH)
			# Discard everything up to and including the match.
			string = substr(string, RSTART+RLENGTH)
			if (length(string) == 0)
				break
		} else if (non_empty) {
			# last match was non-empty, and at the
			# current character we get a zero length match,
			# which we don't want, so skip over it:
			# the first character becomes the separator that
			# follows field nf, and no new field is created
			non_empty = false
			seps[nf] = substr(string, 1, 1)
			string = substr(string, 2)
		} else {
			# 0 length match that is kept as an empty field
			if (! (nf in seps)) {
				if (RSTART == 1)
					seps[nf] = ""
				else
					seps[nf] = substr(string, 1, RSTART - 1)
			}
			array[++nf] = ""
			# non_empty is always false in this branch, so only
			# eosflag decides: unless string was already seen empty,
			# its first character becomes the separator after this
			# empty field.
			if (! non_empty && ! eosflag) { # prev was empty
				seps[nf] = substr(string, 1, 1)
			}
			# Step one character past the position of the match.
			if (RSTART == 1) {
				string = substr(string, 2)
			} else {
				string = substr(string, RSTART + 1)
			}
			non_empty = false
		}
		# Once string is empty, allow exactly one more match() attempt
		# on the empty remainder, then stop.
		if (length(string) == 0) {
			if (eosflag)
				break
			else
				eosflag = true
		}
	}
	# Any text left after the last match is the trailing separator.
	if (length(string) > 0)
		seps[nf] = string

	return length(array)
}