1
# -*- coding: utf-8 -*-
2
# Copyright: petr.michalec@gmail.com
3
# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html
"""
Importing Supermemo XML decks
==============================
"""

__docformat__ = 'restructuredtext'
13
from anki.importing import Importer, ForeignCard
14
from anki.lang import _
15
from anki.errors import *
17
from xml.dom import minidom, Node
18
from types import DictType, InstanceType
19
from string import capwords, maketrans
20
import re, unicodedata, time
24
from anki.deck import Deck
26
class SmartDict(dict):
    """
    See http://www.peterbe.com/plog/SmartDict
    Copyright 2005, Peter Bengtsson, peter@fry-it.com

    A smart dict can be instanciated either from a pythonic dict
    or an instance object (eg. SQL recordsets) but it ensures that you can
    do all the convenient lookups such as x.first_name, x['first_name'] or
    x.get('first_name').
    """

    def __init__(self, *a, **kw):
        if a:
            if type(a[0]) is dict:
                # plain dict: merge its items into the keyword args
                kw.update(a[0])
            elif hasattr(a[0], '__class__') and a[0].__class__.__name__ == 'SmartDict':
                # another SmartDict: its __dict__ IS its item store (see below)
                kw.update(a[0].__dict__)
            elif type(a[0]) is InstanceType:
                # old-style class instance (eg. SQL recordset row)
                kw.update(a[0].__dict__)

        dict.__init__(self, **kw)
        # Alias the attribute namespace to the dict itself so that
        # x.foo, x['foo'] and x.get('foo') are all equivalent.
        self.__dict__ = self
49
class SuperMemoElement(SmartDict):
    "SmartDict wrapper to store SM Element data"

    def __init__(self, *a, **kw):
        SmartDict.__init__(self, *a, **kw)
        # Default values for every field a SuperMemo element may carry;
        # values parsed from the XML overwrite these by tag name.
        self.__dict__['lTitle'] = None       # list of ancestor topic titles
        self.__dict__['Title'] = None
        self.__dict__['Question'] = None
        self.__dict__['Answer'] = None
        self.__dict__['Count'] = None
        self.__dict__['Type'] = None
        self.__dict__['ID'] = None
        self.__dict__['Interval'] = None
        self.__dict__['Lapses'] = None
        self.__dict__['Repetitions'] = None
        # BUGFIX: the original initialised only the misspelled key
        # 'LastRepetiton', while addItemToCards reads item.LastRepetition;
        # keep the old key for compatibility and initialise the correct one.
        self.__dict__['LastRepetiton'] = None
        self.__dict__['LastRepetition'] = None
        self.__dict__['AFactor'] = None
        self.__dict__['UFactor'] = None
71
# This is an AnkiImporter
72
class SupermemoXmlImporter(Importer):
    """
    Supermemo XML export's to Anki parser.
    Goes through a SM collection and fetch all elements.

    My SM collection was a big mess where topics and items were mixed.
    I was unable to parse my content in a regular way like for loop on
    minidom.getElementsByTagName() etc. My collection had also an
    limitation, topics were splited into branches with max 100 items
    on each. Learning themes were in deep structure. I wanted to have
    full title on each element to be stored in tags.

    Code should be upgrade to support importing of SM2006 exports.
    """

    def __init__(self, *args):
        """Initialize internal variables.
        Parameters to be exposed to GUI are stored in self.META"""
        Importer.__init__(self, *args)

        # SmXmlParse VARIABLES
        self.xmldoc = None  # minidom document element, set by loadSource()
        self.cntBuf = []  #to store last parsed data
        self.cntElm = []  #to store SM Elements data
        self.cntCol = []  #to store SM Colections data

        # store some meta info related to parse algorithm
        # SmartDict works like dict / class wrapper
        self.cntMeta = SmartDict()
        self.cntMeta.popTitles = False
        self.cntMeta.title = []

        # META stores controls of import script, should be
        # exposed to import dialog. These are default values.
        self.META = SmartDict()
        self.META.resetLearningData = False  # implemented
        self.META.onlyMemorizedItems = False  # implemented
        self.META.loggerLevel = 2  # implemented 0no,1info,2error,3debug
        self.META.tagAllTopics = True
        self.META.pathsToBeTagged = ['English for begginers', 'Advanced English 97', 'Phrasal Verbs'] # path patterns to be tagged - in gui entered like 'Advanced English 97|My Vocablary'
        self.META.tagMemorizedItems = True  # implemented
        self.META.logToStdOutput = False  # implemented

        # accumulates ForeignCard objects; returned by foreignCards()
        self.cards = []
        # lazily-built translation table used by the (unused) _unescape()
        self._unescape_trtable = None
123
def _fudgeText(self, text):
124
"Replace sm syntax to Anki syntax"
125
text = text.replace("\n\r", u"<br>")
126
text = text.replace("\n", u"<br>")
129
def _unicode2ascii(self,str):
130
"Remove diacritic punctuation from strings (titles)"
131
return u"".join([ c for c in unicodedata.normalize('NFKD', str) if not unicodedata.combining(c)])
133
def _decode_htmlescapes(self,s):
134
"""Unescape HTML code."""
135
#In case of bad formated html you can import MinimalSoup etc.. see btflsoup source code
136
from BeautifulSoup import BeautifulStoneSoup as btflsoup
138
#my sm2004 also ecaped & char in escaped sequences.
139
s = re.sub(u'&',u'&',s)
140
#unescaped solitary chars < or > that were ok for minidom confuse btfl soup
141
s = re.sub(u'>',u'>',s)
142
s = re.sub(u'<',u'<',s)
144
return unicode(btflsoup(s,convertEntities=btflsoup.HTML_ENTITIES ))
147
def _unescape(self,s,initilize):
148
"""Note: This method is not used, BeautifulSoup does better job.
151
if self._unescape_trtable == None:
152
self._unescape_trtable = (
153
('€',u'€'), (' ',u' '), ('!',u'!'), ('"',u'"'), ('#',u'#'), ('$',u'$'), ('%',u'%'), ('&',u'&'), (''',u"'"),
154
('(',u'('), (')',u')'), ('*',u'*'), ('+',u'+'), (',',u','), ('-',u'-'), ('.',u'.'), ('/',u'/'), ('0',u'0'),
155
('1',u'1'), ('2',u'2'), ('3',u'3'), ('4',u'4'), ('5',u'5'), ('6',u'6'), ('7',u'7'), ('8',u'8'), ('9',u'9'),
156
(':',u':'), (';',u';'), ('<',u'<'), ('=',u'='), ('>',u'>'), ('?',u'?'), ('@',u'@'), ('A',u'A'), ('B',u'B'),
157
('C',u'C'), ('D',u'D'), ('E',u'E'), ('F',u'F'), ('G',u'G'), ('H',u'H'), ('I',u'I'), ('J',u'J'), ('K',u'K'),
158
('L',u'L'), ('M',u'M'), ('N',u'N'), ('O',u'O'), ('P',u'P'), ('Q',u'Q'), ('R',u'R'), ('S',u'S'), ('T',u'T'),
159
('U',u'U'), ('V',u'V'), ('W',u'W'), ('X',u'X'), ('Y',u'Y'), ('Z',u'Z'), ('[',u'['), ('\',u'\\'), (']',u']'),
160
('^',u'^'), ('_',u'_'), ('`',u'`'), ('a',u'a'), ('b',u'b'), ('c',u'c'), ('d',u'd'), ('e',u'e'), ('f',u'f'),
161
('g',u'g'), ('h',u'h'), ('i',u'i'), ('j',u'j'), ('k',u'k'), ('l',u'l'), ('m',u'm'), ('n',u'n'),
162
('o',u'o'), ('p',u'p'), ('q',u'q'), ('r',u'r'), ('s',u's'), ('t',u't'), ('u',u'u'), ('v',u'v'),
163
('w',u'w'), ('x',u'x'), ('y',u'y'), ('z',u'z'), ('{',u'{'), ('|',u'|'), ('}',u'}'), ('~',u'~'),
164
(' ',u' '), ('¡',u'¡'), ('¢',u'¢'), ('£',u'£'), ('¤',u'¤'), ('¥',u'¥'), ('¦',u'¦'), ('§',u'§'),
165
('¨',u'¨'), ('©',u'©'), ('ª',u'ª'), ('«',u'«'), ('¬',u'¬'), ('­',u''), ('®',u'®'), ('¯',u'¯'),
166
('°',u'°'), ('±',u'±'), ('²',u'²'), ('³',u'³'), ('´',u'´'), ('µ',u'µ'), ('¶',u'¶'), ('·',u'·'),
167
('¸',u'¸'), ('¹',u'¹'), ('º',u'º'), ('»',u'»'), ('¼',u'¼'), ('½',u'½'), ('¾',u'¾'), ('¿',u'¿'),
168
('À',u'À'), ('Á',u'Á'), ('Â',u'Â'), ('Ã',u'Ã'), ('Ä',u'Ä'), ('Å',u'Å'), ('Å',u'Å'), ('Æ',u'Æ'),
169
('Ç',u'Ç'), ('È',u'È'), ('É',u'É'), ('Ê',u'Ê'), ('Ë',u'Ë'), ('Ì',u'Ì'), ('Í',u'Í'), ('Î',u'Î'),
170
('Ï',u'Ï'), ('Ð',u'Ð'), ('Ñ',u'Ñ'), ('Ò',u'Ò'), ('Ó',u'Ó'), ('Ô',u'Ô'), ('Õ',u'Õ'), ('Ö',u'Ö'),
171
('×',u'×'), ('Ø',u'Ø'), ('Ù',u'Ù'), ('Ú',u'Ú'), ('Û',u'Û'), ('Ü',u'Ü'), ('Ý',u'Ý'), ('Þ',u'Þ'),
172
('ß',u'ß'), ('à',u'à'), ('á',u'á'), ('â',u'â'), ('ã',u'ã'), ('ä',u'ä'), ('å',u'å'), ('æ',u'æ'),
173
('ç',u'ç'), ('è',u'è'), ('é',u'é'), ('ê',u'ê'), ('ë',u'ë'), ('ì',u'ì'), ('í',u'í'), ('í',u'í'),
174
('î',u'î'), ('ï',u'ï'), ('ð',u'ð'), ('ñ',u'ñ'), ('ò',u'ò'), ('ó',u'ó'), ('ô',u'ô'), ('õ',u'õ'),
175
('ö',u'ö'), ('÷',u'÷'), ('ø',u'ø'), ('ù',u'ù'), ('ú',u'ú'), ('û',u'û'), ('ü',u'ü'), ('ý',u'ý'),
176
('þ',u'þ'), ('ÿ',u'ÿ'), ('Ā',u'Ā'), ('ā',u'ā'), ('Ă',u'Ă'), ('ă',u'ă'), ('Ą',u'Ą'), ('ą',u'ą'),
177
('Ć',u'Ć'), ('ć',u'ć'), ('Ĉ',u'Ĉ'), ('ĉ',u'ĉ'), ('Ċ',u'Ċ'), ('ċ',u'ċ'), ('Č',u'Č'), ('č',u'č'),
178
('Ď',u'Ď'), ('ď',u'ď'), ('Đ',u'Đ'), ('đ',u'đ'), ('Ē',u'Ē'), ('ē',u'ē'), ('Ĕ',u'Ĕ'), ('ĕ',u'ĕ'),
179
('Ė',u'Ė'), ('ė',u'ė'), ('Ę',u'Ę'), ('ę',u'ę'), ('Ě',u'Ě'), ('ě',u'ě'), ('Ĝ',u'Ĝ'), ('ĝ',u'ĝ'),
180
('Ğ',u'Ğ'), ('ğ',u'ğ'), ('Ġ',u'Ġ'), ('ġ',u'ġ'), ('Ģ',u'Ģ'), ('ģ',u'ģ'), ('Ĥ',u'Ĥ'), ('ĥ',u'ĥ'),
181
('Ħ',u'Ħ'), ('ħ',u'ħ'), ('Ĩ',u'Ĩ'), ('ĩ',u'ĩ'), ('Ī',u'Ī'), ('ī',u'ī'), ('Ĭ',u'Ĭ'), ('ĭ',u'ĭ'),
182
('Į',u'Į'), ('į',u'į'), ('İ',u'İ'), ('ı',u'ı'), ('IJ',u'IJ'), ('ij',u'ij'), ('Ĵ',u'Ĵ'), ('ĵ',u'ĵ'),
183
('Ķ',u'Ķ'), ('ķ',u'ķ'), ('ĸ',u'ĸ'), ('Ĺ',u'Ĺ'), ('ĺ',u'ĺ'), ('Ļ',u'Ļ'), ('ļ',u'ļ'), ('Ľ',u'Ľ'),
184
('ľ',u'ľ'), ('Ŀ',u'Ŀ'), ('ŀ',u'ŀ'), ('Ł',u'Ł'), ('ł',u'ł'), ('Ń',u'Ń'), ('ń',u'ń'), ('Ņ',u'Ņ'),
185
('ņ',u'ņ'), ('Ň',u'Ň'), ('ň',u'ň'), ('ʼn',u'ʼn'), ('Ŋ',u'Ŋ'), ('ŋ',u'ŋ'), ('Ō',u'Ō'), ('ō',u'ō'),
186
('Ŏ',u'Ŏ'), ('ŏ',u'ŏ'), ('Ő',u'Ő'), ('ő',u'ő'), ('Œ',u'Œ'), ('œ',u'œ'), ('Ŕ',u'Ŕ'), ('ŕ',u'ŕ'),
187
('Ŗ',u'Ŗ'), ('ŗ',u'ŗ'), ('Ř',u'Ř'), ('ř',u'ř'), ('Ś',u'Ś'), ('ś',u'ś'), ('Ŝ',u'Ŝ'), ('ŝ',u'ŝ'),
188
('Ş',u'Ş'), ('ş',u'ş'), ('Š',u'Š'), ('š',u'š'), ('Ţ',u'Ţ'), ('ţ',u'ţ'), ('Ť',u'Ť'), ('ť',u'ť'),
189
('Ŧ',u'Ŧ'), ('ŧ',u'ŧ'), ('Ũ',u'Ũ'), ('ũ',u'ũ'), ('Ū',u'Ū'), ('ū',u'ū'), ('Ŭ',u'Ŭ'), ('ŭ',u'ŭ'),
190
('Ů',u'Ů'), ('ů',u'ů'), ('Ű',u'Ű'), ('ű',u'ű'), ('Ų',u'Ų'), ('ų',u'ų'), ('Ŵ',u'Ŵ'), ('ŵ',u'ŵ'),
191
('Ŷ',u'Ŷ'), ('ŷ',u'ŷ'), ('Ÿ',u'Ÿ'), ('Ź',u'Ź'), ('ź',u'ź'), ('Ż',u'Ż'), ('ż',u'ż'), ('Ž',u'Ž'),
192
('ž',u'ž'), ('ſ',u'ſ'), ('Ŕ',u'Ŕ'), ('ŕ',u'ŕ'), ('Ŗ',u'Ŗ'), ('ŗ',u'ŗ'), ('Ř',u'Ř'), ('ř',u'ř'),
193
('Ś',u'Ś'), ('ś',u'ś'), ('Ŝ',u'Ŝ'), ('ŝ',u'ŝ'), ('Ş',u'Ş'), ('ş',u'ş'), ('Š',u'Š'), ('š',u'š'),
194
('Ţ',u'Ţ'), ('ţ',u'ţ'), ('Ť',u'Ť'), ('Ɂ',u'ť'), ('Ŧ',u'Ŧ'), ('ŧ',u'ŧ'), ('Ũ',u'Ũ'), ('ũ',u'ũ'),
195
('Ū',u'Ū'), ('ū',u'ū'), ('Ŭ',u'Ŭ'), ('ŭ',u'ŭ'), ('Ů',u'Ů'), ('ů',u'ů'), ('Ű',u'Ű'), ('ű',u'ű'),
196
('Ų',u'Ų'), ('ų',u'ų'), ('Ŵ',u'Ŵ'), ('ŵ',u'ŵ'), ('Ŷ',u'Ŷ'), ('ŷ',u'ŷ'), ('Ÿ',u'Ÿ'), ('Ź',u'Ź'),
197
('ź',u'ź'), ('Ż',u'Ż'), ('ż',u'ż'), ('Ž',u'Ž'), ('ž',u'ž'), ('ſ',u'ſ'),
202
#s = s.replace(code[0], code[1])
204
## DEFAULT IMPORTER METHODS
206
def foreignCards(self):
208
# Load file and parse it by minidom
209
self.loadSource(self.file)
211
# Migrating content / time consuming part
212
# addItemToCards is called for each sm element
213
self.logger(u'Parsing started.')
215
self.logger(u'Parsing done.')
217
# Return imported cards
225
def addItemToCards(self,item):
226
"This method actually do conversion"
232
card.fields.append(self._fudgeText(self._decode_htmlescapes(item.Question)))
233
card.fields.append(self._fudgeText(self._decode_htmlescapes(item.Answer)))
236
# pre-process scheduling data
237
tLastrep = time.mktime(time.strptime(item.LastRepetition, '%d.%m.%Y'))
240
# convert learning data
241
if not self.META.resetLearningData:
242
# migration of LearningData algorithm
243
card.interval = item.Interval
244
card.successive = item.Repetitions
245
##card.due = tToday + (float(item.Interval) * 86400.0) - tLastrep
246
card.due = tLastrep + (float(item.Interval) * 86400.0)
249
card.factor = float(item.AFactor.replace(',','.'))
250
card.lastFactor = float(item.AFactor.replace(',','.'))
252
# SM is not exporting all the information Anki keeps track off, so it
254
card.youngEase0 = item.Lapses
255
card.youngEase3 = item.Repetitions + item.Lapses
256
card.yesCount = item.Repetitions
257
card.noCount = item.Lapses
258
card.reps = card.yesCount + card.noCount
259
card.spaceUntil = card.due
260
card.combinedDue = card.due
263
# it's worth to have every theme (tree structure of sm collection) stored in tags, but sometimes not
264
# you can deceide if you are going to tag all toppics or just that containing some pattern
266
for pattern in self.META.pathsToBeTagged:
267
if item.lTitle != None and pattern.lower() in u" ".join(item.lTitle).lower():
270
if tTaggTitle or self.META.tagAllTopics:
271
# normalize - remove diacritic punctuation from unicode chars to ascii
272
item.lTitle = [ self._unicode2ascii(topic) for topic in item.lTitle]
274
# Transfrom xyz / aaa / bbb / ccc on Title path to Tag xyzAaaBbbCcc
275
# clean things like [999] or [111-2222] from title path, example: xyz / [1000-1200] zyx / xyz
277
# set Capital letters for first char of the word
278
tmp = list(set([ re.sub('(\[[0-9]+\])' , ' ' , i ).replace('_',' ') for i in item.lTitle ]))
279
tmp = list(set([ re.sub('(\W)',' ', i ) for i in tmp ]))
280
tmp = list(set([ re.sub( '^[0-9 ]+$','',i) for i in tmp ]))
281
tmp = list(set([ capwords(i).replace(' ','') for i in tmp ]))
282
tags = [ j[0].lower() + j[1:] for j in tmp if j.strip() <> '']
284
card.tags += u" ".join(tags)
286
if self.META.tagMemorizedItems and item.Interval >0:
287
card.tags += " Memorized"
289
self.logger(u'Element tags\t- ' + card.tags, level=3)
291
self.cards.append(card)
293
def logger(self,text,level=1):
294
"Wrapper for Anki logger"
296
dLevels={0:'',1:u'Info',2:u'Verbose',3:u'Debug'}
297
if level<=self.META.loggerLevel:
298
self.deck.updateProgress(_(text))
300
if self.META.logToStdOutput:
301
print self.__class__.__name__+ u" - " + dLevels[level].ljust(9) +u' -\t'+ _(text)
305
def openAnything(self,source):
306
"Open any source / actually only openig of files is used"
311
# try to open with urllib (if source is http, ftp, or file URL)
314
return urllib.urlopen(source)
315
except (IOError, OSError):
318
# try to open with native open function (if source is pathname)
321
except (IOError, OSError):
324
# treat source as string
326
return StringIO.StringIO(str(source))
328
def loadSource(self, source):
329
"""Load source file and parse with xml.dom.minidom"""
331
self.logger(u'Load started...')
332
sock = self.openAnything(self.source)
333
self.xmldoc = minidom.parse(sock).documentElement
335
self.logger(u'Load done.')
339
def parse(self, node=None):
340
"Parse method - parses document elements"
342
if node==None and self.xmldoc<>None:
345
_method = "parse_%s" % node.__class__.__name__
346
if hasattr(self,_method):
347
parseMethod = getattr(self, _method)
350
self.logger(u'No handler for method %s' % _method, level=3)
352
def parse_Document(self, node):
355
self.parse(node.documentElement)
357
def parse_Element(self, node):
360
_method = "do_%s" % node.tagName
361
if hasattr(self,_method):
362
handlerMethod = getattr(self, _method)
365
self.logger(u'No handler for method %s' % _method, level=3)
366
#print traceback.print_exc()
368
def parse_Text(self, node):
369
"Parse text inside elements. Text is stored into local buffer."
372
self.cntBuf.append(text)
374
#def parse_Comment(self, node):
376
# Source can contain XML comments, but we ignore them
382
def do_SuperMemoCollection(self, node):
383
"Process SM Collection"
385
for child in node.childNodes: self.parse(child)
387
def do_SuperMemoElement(self, node):
388
"Process SM Element (Type - Title,Topics)"
390
self.logger('='*45, level=3)
392
self.cntElm.append(SuperMemoElement())
393
self.cntElm[-1]['lTitle'] = self.cntMeta['title']
395
#parse all child elements
396
for child in node.childNodes: self.parse(child)
398
#strip all saved strings, just for sure
399
for key in self.cntElm[-1].keys():
400
if hasattr(self.cntElm[-1][key], 'strip'):
401
self.cntElm[-1][key]=self.cntElm[-1][key].strip()
404
smel = self.cntElm.pop()
406
# Process cntElm if is valid Item (and not an Topic etc..)
407
# if smel.Lapses != None and smel.Interval != None and smel.Question != None and smel.Answer != None:
408
if smel.Title == None and smel.Question != None and smel.Answer != None:
409
if smel.Answer.strip() !='' and smel.Question.strip() !='':
411
# migrate only memorized otherway skip/continue
412
if self.META.onlyMemorizedItems and not(int(smel.Interval) > 0):
413
self.logger(u'Element skiped \t- not memorized ...', level=3)
415
#import sm element data to Anki
416
self.addItemToCards(smel)
417
self.logger(u"Import element \t- " + smel['Question'], level=3)
420
self.logger('-'*45, level=3)
421
for key in smel.keys():
422
self.logger('\t%s %s' % ((key+':').ljust(15),smel[key]), level=3 )
424
self.logger(u'Element skiped \t- no valid Q and A ...', level=3)
428
# now we know that item was topic
429
# parseing of whole node is now finished
431
# test if it's really topic
432
if smel.Title != None:
433
# remove topic from title list
434
t = self.cntMeta['title'].pop()
435
self.logger(u'End of topic \t- %s' % (t), level=2)
437
def do_Content(self, node):
438
"Process SM element Content"
440
for child in node.childNodes:
441
if hasattr(child,'tagName') and child.firstChild != None:
442
self.cntElm[-1][child.tagName]=child.firstChild.data
444
def do_LearningData(self, node):
445
"Process SM element LearningData"
447
for child in node.childNodes:
448
if hasattr(child,'tagName') and child.firstChild != None:
449
self.cntElm[-1][child.tagName]=child.firstChild.data
451
# It's being processed in do_Content now
452
#def do_Question(self, node):
453
# for child in node.childNodes: self.parse(child)
454
# self.cntElm[-1][node.tagName]=self.cntBuf.pop()
456
# It's being processed in do_Content now
457
#def do_Answer(self, node):
458
# for child in node.childNodes: self.parse(child)
459
# self.cntElm[-1][node.tagName]=self.cntBuf.pop()
461
def do_Title(self, node):
462
"Process SM element Title"
464
t = self._decode_htmlescapes(node.firstChild.data)
465
self.cntElm[-1][node.tagName] = t
466
self.cntMeta['title'].append(t)
467
self.cntElm[-1]['lTitle'] = self.cntMeta['title']
468
self.logger(u'Start of topic \t- ' + u" / ".join(self.cntMeta['title']), level=2)
471
def do_Type(self, node):
472
"Process SM element Type"
474
if len(self.cntBuf) >=1 :
475
self.cntElm[-1][node.tagName]=self.cntBuf.pop()
478
if __name__ == '__main__':

    # for testing you can start it standalone
    import sys  # BUGFIX: sys.argv used below but sys was never imported

    #file = u'/home/epcim/hg2g/dev/python/sm2anki/ADVENG2EXP.xxe.esc.zaloha_FINAL.xml'
    #file = u'/home/epcim/hg2g/dev/python/anki/libanki/tests/importing/supermemo/original_ENGLISHFORBEGGINERS_noOEM.xml'
    #file = u'/home/epcim/hg2g/dev/python/anki/libanki/tests/importing/supermemo/original_ENGLISHFORBEGGINERS_oem_1250.xml'
    file = str(sys.argv[1])
    impo = SupermemoXmlImporter(Deck(), file)
    impo.foreignCards()
491
# vim: ts=4 sts=2 ft=python