1
import importer, plaintext_importer, re, string
2
from gourmet import check_encodings
3
from gourmet.gdebug import *
4
from gettext import gettext as _
6
# Regular expression for the banner line that opens each recipe in a
# MasterCook text export, e.g. "* Exported from MasterCook *".
# Written as a raw string so the backslash escapes reach the re module
# unchanged instead of relying on Python passing unknown escapes through.
MASTERCOOK_START_REGEXP = r'\s*\*\s*Exported\s*from\s*MasterCook.*\*\s*'
8
class mastercook_importer (plaintext_importer.TextImporter):
9
ATTR_DICT = {'Recipe By':'source',
10
'Serving Size':'servings',
11
'Preparation Time':'preptime',
12
'Categories':'category',
14
def __init__ (self, filename, rd, progress=None, threaded=False, conv=None):
15
self.progress = progress
16
self.compile_regexps()
19
self.in_instructions = False
21
self.looking_for_title = False
25
self.reccol_headers = False
26
plaintext_importer.TextImporter.__init__(self,filename,rd,progress=progress,threaded=threaded,
29
def compile_regexps(self):
    """Compile the regular expressions used to parse a MasterCook export.

    Runs the TextImporter version first, then stores each compiled
    pattern on self as a ``*_matcher`` attribute.
    """
    plaintext_importer.TextImporter.compile_regexps(self)
    self.rec_start_matcher = re.compile(MASTERCOOK_START_REGEXP)
    self.blank_matcher = re.compile(r"^\s*$")
    # A strange thing has happened -- some archives have the columns
    # off by exactly 1 character, which fouls up our parsing. To
    # solve the problem, we first recognize the header row with
    # rec_col_matcher, then parse fields using the ------ underlining,
    # which appears to line up even in the broken archives.
    self.rec_col_matcher = re.compile(r"(\s*Amount\s*)(Measure\s*)(Ingredient.*)")
    self.rec_col_underline_matcher = re.compile(r"(\s*-+)(\s*-+)(\s*-+.*)")
    # A line consisting only of dashes and spaces, with at least one dash.
    self.dash_matcher = re.compile(r"^[ -]*[-][- ]*$")
    # "or" in any case, optionally wrapped in non-word characters.
    # NOTE(review): the pattern is not anchored at the end, so with
    # .match() a line starting with e.g. "oregano" also matches -- confirm
    # whether that is intended.
    self.ing_or_matcher = re.compile(r"\W*[Oo][Rr]\W*")
    # Match a string enclosed in a possibly repeated non-word character
    # such as *Group* or ---group--- or =======GROUP======
    # grabbing groups()[1] will get you the enclosed string.
    self.ing_group_matcher = re.compile(r"\s*(\W)\1*(.+?)(\1+)")
    self.mods_matcher = re.compile(r"^\s*NOTES\.*")
    # One alternative per known attribute label, followed by ":" and the
    # value. str.join is used instead of string.join, which only exists
    # in Python 2.
    attr_matcher = r"\s*(" + "|".join(self.ATTR_DICT.keys()) + r")\s*:(.*)"
    self.attr_matcher = re.compile(attr_matcher)
51
def handle_line (self, line):
52
if self.dash_matcher.match(line): return
53
if self.rec_start_matcher.match(line):
54
debug('rec_start! %s'%line,0)
55
self.looking_for_title = True
56
if self.rec: self.commit_rec()
59
self.in_instructions=False
65
if self.reccol_headers:
66
# we try to parse underlining after our standard ing headers.
67
rcm = self.rec_col_underline_matcher.match(line)
68
# if there is no underlining, use our headers themselves for fields
69
if not rcm: rcm = self.reccol_headers
70
debug('Found ing columns',0)
71
self.get_ing_cols(rcm)
73
self.reccol_headers=False
75
rcm=self.rec_col_matcher.match(line)
77
self.reccol_headers = rcm
78
self.looking_for_title=False
82
if self.blank_matcher.match(line):
83
# blank line ends ingredients
85
debug('blank line, end of ings',0)
87
self.in_instructions = True
88
if self.ing: self.commit_ing()
89
if self.in_instructions:
90
debug('blank line added to instructions: %s'%line,0)
91
if self.in_mods: self.mods += "\n"
92
else: self.instr+="\n"
94
if self.looking_for_title:
95
debug('found my title! %s'%line.strip(),0)
96
self.rec['title']=line.strip()
97
self.looking_for_title = False
101
debug('handling ingredient line %s'%line,0)
102
self.handle_ingline (line)
105
debug('handing attrline %s'%line,0)
106
self.handle_attribute(line)
109
self.in_instructions = True
110
if self.mods_matcher.match(line):
113
debug('handling modifications line %s'%line,0)
114
self.add_to_attr('mods',line)
116
debug('handling instructions line %s'%line,0)
117
self.add_to_attr('instr',line)
119
def add_to_attr (self, attr, txt):
120
orig = getattr(self,attr)
122
if len(txt.strip()) < 50:
123
setattr(self,attr,orig+"%s\n"%txt.strip())
124
elif not self.blank_matcher.match(orig[-1]):
125
setattr(self,attr,orig+" %s"%txt.strip())
127
setattr(self,attr,orig+txt.strip())
129
setattr(self,attr,txt)
131
def get_ing_cols(self, rcm):
    """Record the ingredient column boundaries from a three-group match.

    rcm is a match object whose three groups cover, left to right, the
    amount, unit and item columns. The width of each group's text gives
    the slice bounds stored on self:

    amt_col  -- (0, end of amount column)
    unit_col -- (end of amount column, end of unit column)
    itm_col  -- (end of unit column, None), i.e. to the end of the line
    """
    amt_text, unit_text, itm_text = rcm.groups()
    amt_end = len(amt_text)
    unit_end = amt_end + len(unit_text)
    # The item width is measured but not needed: the item column is open-ended.
    itm_width = len(itm_text)
    self.amt_col = (0, amt_end)
    self.unit_col = (amt_end, unit_end)
    self.itm_col = (unit_end, None)
138
def handle_attribute (self,line):
139
m=self.attr_matcher.match(line)
141
attr,val = m.groups()
142
SecndColMatch = self.attr_matcher.search(val)
144
s=SecndColMatch.start()
145
self.handle_attribute(val[s:])
147
val = self.join_multiple_attvals(val.strip())
149
self.last_attr = self.ATTR_DICT[attr]
150
self.rec[self.ATTR_DICT[attr]]=val
153
# attribute values can run over one line...
154
self.rec[self.last_attr]=', '.join([self.rec[self.last_attr],
155
self.join_multiple_attvals(line.strip())
158
# otherwise, we add this to instructions, like we do with all junk
161
def join_multiple_attvals(self, txt):
    """Replace each run of two or more spaces in txt with ", ".

    Values separated by wide gaps become a comma-separated list, while
    single spaces inside a value (e.g. "Main Dish") are left alone.
    Returns the resulting string; text without such a run is returned
    unchanged.
    """
    # Split only on runs of at least two spaces; splitting on single
    # spaces would break multi-word values apart.
    return ', '.join(re.split(' {2,}', txt))
165
def handle_ingline (self,line):
166
if self.ing_or_matcher.match(line):
169
amt = line.__getslice__(*self.amt_col).strip()
170
unit = line.__getslice__(*self.unit_col).strip()
171
itm = line[self.itm_col[0]:].strip()
172
gm=self.ing_group_matcher.match(itm)
174
if self.ing: self.commit_ing()
175
self.group = gm.groups()[1]
176
# undo grouping if it has no letters...
177
if re.match('^[^A-Za-z]*$',self.group): self.group=None
180
if self.in_or: self.ing['optional']=True
181
if self.ing: self.commit_ing()
184
self.ing['optional']=True
190
elif self.ing and self.ing.has_key('item'):
191
# otherwise, we assume we are a continuation and
192
# add onto the previous item
193
self.ing['item']=self.ing['item']+' '+itm.strip()
195
debug('"%s" in the midst of ingredients looks like instructions!'%itm.strip(),2)
196
self.instr += "\n"+itm.strip()
198
def commit_ing (self):
199
if not self.ing.has_key('item'):
201
key_base = self.ing['item'].split('--')[0]
202
self.ing['ingkey']=self.km.get_key_fast(key_base)
203
importer.importer.commit_ing(self)
206
def commit_rec(self):
    """Move the collected instruction and note text onto the recipe and commit it.

    self.instr and self.mods are passed through self.unwrap_lines and
    stored under the 'instructions' and 'modifications' keys of
    self.rec; the actual commit is then delegated to
    importer.importer.commit_rec.
    """
    # The former local `ll` (self.instr split into lines) was never used
    # and has been dropped.
    self.rec['instructions'] = self.unwrap_lines(self.instr)
    self.rec['modifications'] = self.unwrap_lines(self.mods)
    importer.importer.commit_rec(self)
212
class Tester (importer.Tester):
214
importer.Tester.__init__(self,regexp=MASTERCOOK_START_REGEXP)
215
self.not_me = "<[?]?(xml|mx2|RcpE|RTxt)[^>]*>"
217
def test (self, filename):
218
if not hasattr(self,'matcher'):
219
self.matcher=re.compile(self.regexp)
220
self.not_matcher = re.compile(self.not_me)
221
if type(filename)==str:
222
self.ofi = open(filename,'r')
227
l = self.ofi.readline()
229
if self.not_matcher.match(l):
232
if self.matcher.match(l):
235
l = self.ofi.readline()
236
if CLOSE: self.ofi.close()
237
else: self.ofi.seek(0)