1
# -*- coding: utf-8 -*-
2
# Copyright: petr.michalec@gmail.com
3
# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html
"""
Importing Supermemo XML decks
==============================
"""

__docformat__ = 'restructuredtext'
13
from anki.importing import Importer, ForeignCard
14
from anki.lang import _
15
from anki.errors import *
17
from xml.dom import minidom, Node
18
from types import DictType, InstanceType
19
from string import capwords, maketrans
20
import re, unicodedata, time
24
from anki.deck import Deck
26
class SmartDict(dict):
    """
    See http://www.peterbe.com/plog/SmartDict
    Copyright 2005, Peter Bengtsson, peter@fry-it.com

    A smart dict can be instanciated either from a pythonic dict
    or an instance object (eg. SQL recordsets) but it ensures that you can
    do all the convenient lookups such as x.first_name, x['first_name'] or
    x.get('first_name').
    """

    def __init__(self, *a, **kw):
        if a:
            if type(a[0]) is dict:
                # plain dict: merge its items into the keyword args
                kw.update(a[0])
            elif hasattr(a[0], '__class__') and a[0].__class__.__name__ == 'SmartDict':
                # another SmartDict: its __dict__ IS its item store (see below)
                kw.update(a[0].__dict__)
            elif type(a[0]) is InstanceType:
                # old-style class instance (eg. SQL recordset row)
                kw.update(a[0].__dict__)

        dict.__init__(self, **kw)
        # Alias the attribute namespace to the dict itself so that
        # x.foo, x['foo'] and x.get('foo') are all equivalent.
        self.__dict__ = self
49
class SuperMemoElement(SmartDict):
    "SmartDict wrapper to store SM Element data"

    def __init__(self, *a, **kw):
        SmartDict.__init__(self, *a, **kw)
        # Default values for every field a SuperMemo element may carry;
        # values parsed from the XML overwrite these by tag name.
        self.__dict__['lTitle'] = None       # list of ancestor topic titles
        self.__dict__['Title'] = None
        self.__dict__['Question'] = None
        self.__dict__['Answer'] = None
        self.__dict__['Count'] = None
        self.__dict__['Type'] = None
        self.__dict__['ID'] = None
        self.__dict__['Interval'] = None
        self.__dict__['Lapses'] = None
        self.__dict__['Repetitions'] = None
        # BUGFIX: the original initialised only the misspelled key
        # 'LastRepetiton', while addItemToCards reads item.LastRepetition;
        # keep the old key for compatibility and initialise the correct one.
        self.__dict__['LastRepetiton'] = None
        self.__dict__['LastRepetition'] = None
        self.__dict__['AFactor'] = None
        self.__dict__['UFactor'] = None
71
# This is an AnkiImporter
72
class SupermemoXmlImporter(Importer):
    """
    Supermemo XML export's to Anki parser.
    Goes through a SM collection and fetch all elements.

    My SM collection was a big mess where topics and items were mixed.
    I was unable to parse my content in a regular way like for loop on
    minidom.getElementsByTagName() etc. My collection had also an
    limitation, topics were splited into branches with max 100 items
    on each. Learning themes were in deep structure. I wanted to have
    full title on each element to be stored in tags.

    Code should be upgrade to support importing of SM2006 exports.
    """

    def __init__(self, *args):
        """Initialize internal variables.
        Parameters to be exposed to GUI are stored in self.META"""
        Importer.__init__(self, *args)

        # SmXmlParse VARIABLES
        self.xmldoc = None  # minidom document element, set by loadSource()
        self.cntBuf = []  #to store last parsed data
        self.cntElm = []  #to store SM Elements data
        self.cntCol = []  #to store SM Colections data

        # store some meta info related to parse algorithm
        # SmartDict works like dict / class wrapper
        self.cntMeta = SmartDict()
        self.cntMeta.popTitles = False
        self.cntMeta.title = []

        # META stores controls of import script, should be
        # exposed to import dialog. These are default values.
        self.META = SmartDict()
        self.META.resetLearningData = False  # implemented
        self.META.onlyMemorizedItems = False  # implemented
        self.META.loggerLevel = 2  # implemented 0no,1info,2error,3debug
        self.META.tagAllTopics = True
        self.META.pathsToBeTagged = ['English for begginers', 'Advanced English 97', 'Phrasal Verbs'] # path patterns to be tagged - in gui entered like 'Advanced English 97|My Vocablary'
        self.META.tagMemorizedItems = True  # implemented
        self.META.logToStdOutput = False  # implemented

        # accumulates ForeignCard objects; returned by foreignCards()
        self.cards = []
        # lazily-built translation table used by the (unused) _unescape()
        self._unescape_trtable = None
123
def _fudgeText(self, text):
124
"Replace sm syntax to Anki syntax"
125
text = text.replace("\n\r", u"<br>")
126
text = text.replace("\n", u"<br>")
129
def _unicode2ascii(self,str):
130
"Remove diacritic punctuation from strings (titles)"
131
return u"".join([ c for c in unicodedata.normalize('NFKD', str) if not unicodedata.combining(c)])
133
def _decode_htmlescapes(self,s):
134
"""Unescape HTML code."""
135
#In case of bad formated html you can import MinimalSoup etc.. see btflsoup source code
136
from BeautifulSoup import BeautifulStoneSoup as btflsoup
138
#my sm2004 also ecaped & char in escaped sequences.
139
s = re.sub(u'&',u'&',s)
140
#unescaped solitary chars < or > that were ok for minidom confuse btfl soup
141
s = re.sub(u'>',u'>',s)
142
s = re.sub(u'<',u'<',s)
144
return unicode(btflsoup(s,convertEntities=btflsoup.HTML_ENTITIES ))
147
def _unescape(self,s,initilize):
148
"""Note: This method is not used, BeautifulSoup does better job.
151
if self._unescape_trtable == None:
152
self._unescape_trtable = (
153
('€',u'€'), (' ',u' '), ('!',u'!'), ('"',u'"'), ('#',u'#'), ('$',u'$'), ('%',u'%'), ('&',u'&'), (''',u"'"),
154
('(',u'('), (')',u')'), ('*',u'*'), ('+',u'+'), (',',u','), ('-',u'-'), ('.',u'.'), ('/',u'/'), ('0',u'0'),
155
('1',u'1'), ('2',u'2'), ('3',u'3'), ('4',u'4'), ('5',u'5'), ('6',u'6'), ('7',u'7'), ('8',u'8'), ('9',u'9'),
156
(':',u':'), (';',u';'), ('<',u'<'), ('=',u'='), ('>',u'>'), ('?',u'?'), ('@',u'@'), ('A',u'A'), ('B',u'B'),
157
('C',u'C'), ('D',u'D'), ('E',u'E'), ('F',u'F'), ('G',u'G'), ('H',u'H'), ('I',u'I'), ('J',u'J'), ('K',u'K'),
158
('L',u'L'), ('M',u'M'), ('N',u'N'), ('O',u'O'), ('P',u'P'), ('Q',u'Q'), ('R',u'R'), ('S',u'S'), ('T',u'T'),
159
('U',u'U'), ('V',u'V'), ('W',u'W'), ('X',u'X'), ('Y',u'Y'), ('Z',u'Z'), ('[',u'['), ('\',u'\\'), (']',u']'),
160
('^',u'^'), ('_',u'_'), ('`',u'`'), ('a',u'a'), ('b',u'b'), ('c',u'c'), ('d',u'd'), ('e',u'e'), ('f',u'f'),
161
('g',u'g'), ('h',u'h'), ('i',u'i'), ('j',u'j'), ('k',u'k'), ('l',u'l'), ('m',u'm'), ('n',u'n'),
162
('o',u'o'), ('p',u'p'), ('q',u'q'), ('r',u'r'), ('s',u's'), ('t',u't'), ('u',u'u'), ('v',u'v'),
163
('w',u'w'), ('x',u'x'), ('y',u'y'), ('z',u'z'), ('{',u'{'), ('|',u'|'), ('}',u'}'), ('~',u'~'),
164
(' ',u' '), ('¡',u'¡'), ('¢',u'¢'), ('£',u'£'), ('¤',u'¤'), ('¥',u'¥'), ('¦',u'¦'), ('§',u'§'),
165
('¨',u'¨'), ('©',u'©'), ('ª',u'ª'), ('«',u'«'), ('¬',u'¬'), ('­',u''), ('®',u'®'), ('¯',u'¯'),
166
('°',u'°'), ('±',u'±'), ('²',u'²'), ('³',u'³'), ('´',u'´'), ('µ',u'µ'), ('¶',u'¶'), ('·',u'·'),
167
('¸',u'¸'), ('¹',u'¹'), ('º',u'º'), ('»',u'»'), ('¼',u'¼'), ('½',u'½'), ('¾',u'¾'), ('¿',u'¿'),
168
('À',u'À'), ('Á',u'Á'), ('Â',u'Â'), ('Ã',u'Ã'), ('Ä',u'Ä'), ('Å',u'Å'), ('Å',u'Å'), ('Æ',u'Æ'),
169
('Ç',u'Ç'), ('È',u'È'), ('É',u'É'), ('Ê',u'Ê'), ('Ë',u'Ë'), ('Ì',u'Ì'), ('Í',u'Í'), ('Î',u'Î'),
170
('Ï',u'Ï'), ('Ð',u'Ð'), ('Ñ',u'Ñ'), ('Ò',u'Ò'), ('Ó',u'Ó'), ('Ô',u'Ô'), ('Õ',u'Õ'), ('Ö',u'Ö'),
171
('×',u'×'), ('Ø',u'Ø'), ('Ù',u'Ù'), ('Ú',u'Ú'), ('Û',u'Û'), ('Ü',u'Ü'), ('Ý',u'Ý'), ('Þ',u'Þ'),
172
('ß',u'ß'), ('à',u'à'), ('á',u'á'), ('â',u'â'), ('ã',u'ã'), ('ä',u'ä'), ('å',u'å'), ('æ',u'æ'),
173
('ç',u'ç'), ('è',u'è'), ('é',u'é'), ('ê',u'ê'), ('ë',u'ë'), ('ì',u'ì'), ('í',u'í'), ('í',u'í'),
174
('î',u'î'), ('ï',u'ï'), ('ð',u'ð'), ('ñ',u'ñ'), ('ò',u'ò'), ('ó',u'ó'), ('ô',u'ô'), ('õ',u'õ'),
175
('ö',u'ö'), ('÷',u'÷'), ('ø',u'ø'), ('ù',u'ù'), ('ú',u'ú'), ('û',u'û'), ('ü',u'ü'), ('ý',u'ý'),
176
('þ',u'þ'), ('ÿ',u'ÿ'), ('Ā',u'Ā'), ('ā',u'ā'), ('Ă',u'Ă'), ('ă',u'ă'), ('Ą',u'Ą'), ('ą',u'ą'),
177
('Ć',u'Ć'), ('ć',u'ć'), ('Ĉ',u'Ĉ'), ('ĉ',u'ĉ'), ('Ċ',u'Ċ'), ('ċ',u'ċ'), ('Č',u'Č'), ('č',u'č'),
178
('Ď',u'Ď'), ('ď',u'ď'), ('Đ',u'Đ'), ('đ',u'đ'), ('Ē',u'Ē'), ('ē',u'ē'), ('Ĕ',u'Ĕ'), ('ĕ',u'ĕ'),
179
('Ė',u'Ė'), ('ė',u'ė'), ('Ę',u'Ę'), ('ę',u'ę'), ('Ě',u'Ě'), ('ě',u'ě'), ('Ĝ',u'Ĝ'), ('ĝ',u'ĝ'),
180
('Ğ',u'Ğ'), ('ğ',u'ğ'), ('Ġ',u'Ġ'), ('ġ',u'ġ'), ('Ģ',u'Ģ'), ('ģ',u'ģ'), ('Ĥ',u'Ĥ'), ('ĥ',u'ĥ'),
181
('Ħ',u'Ħ'), ('ħ',u'ħ'), ('Ĩ',u'Ĩ'), ('ĩ',u'ĩ'), ('Ī',u'Ī'), ('ī',u'ī'), ('Ĭ',u'Ĭ'), ('ĭ',u'ĭ'),
182
('Į',u'Į'), ('į',u'į'), ('İ',u'İ'), ('ı',u'ı'), ('IJ',u'IJ'), ('ij',u'ij'), ('Ĵ',u'Ĵ'), ('ĵ',u'ĵ'),
183
('Ķ',u'Ķ'), ('ķ',u'ķ'), ('ĸ',u'ĸ'), ('Ĺ',u'Ĺ'), ('ĺ',u'ĺ'), ('Ļ',u'Ļ'), ('ļ',u'ļ'), ('Ľ',u'Ľ'),
184
('ľ',u'ľ'), ('Ŀ',u'Ŀ'), ('ŀ',u'ŀ'), ('Ł',u'Ł'), ('ł',u'ł'), ('Ń',u'Ń'), ('ń',u'ń'), ('Ņ',u'Ņ'),
185
('ņ',u'ņ'), ('Ň',u'Ň'), ('ň',u'ň'), ('ʼn',u'ʼn'), ('Ŋ',u'Ŋ'), ('ŋ',u'ŋ'), ('Ō',u'Ō'), ('ō',u'ō'),
186
('Ŏ',u'Ŏ'), ('ŏ',u'ŏ'), ('Ő',u'Ő'), ('ő',u'ő'), ('Œ',u'Œ'), ('œ',u'œ'), ('Ŕ',u'Ŕ'), ('ŕ',u'ŕ'),
187
('Ŗ',u'Ŗ'), ('ŗ',u'ŗ'), ('Ř',u'Ř'), ('ř',u'ř'), ('Ś',u'Ś'), ('ś',u'ś'), ('Ŝ',u'Ŝ'), ('ŝ',u'ŝ'),
188
('Ş',u'Ş'), ('ş',u'ş'), ('Š',u'Š'), ('š',u'š'), ('Ţ',u'Ţ'), ('ţ',u'ţ'), ('Ť',u'Ť'), ('ť',u'ť'),
189
('Ŧ',u'Ŧ'), ('ŧ',u'ŧ'), ('Ũ',u'Ũ'), ('ũ',u'ũ'), ('Ū',u'Ū'), ('ū',u'ū'), ('Ŭ',u'Ŭ'), ('ŭ',u'ŭ'),
190
('Ů',u'Ů'), ('ů',u'ů'), ('Ű',u'Ű'), ('ű',u'ű'), ('Ų',u'Ų'), ('ų',u'ų'), ('Ŵ',u'Ŵ'), ('ŵ',u'ŵ'),
191
('Ŷ',u'Ŷ'), ('ŷ',u'ŷ'), ('Ÿ',u'Ÿ'), ('Ź',u'Ź'), ('ź',u'ź'), ('Ż',u'Ż'), ('ż',u'ż'), ('Ž',u'Ž'),
192
('ž',u'ž'), ('ſ',u'ſ'), ('Ŕ',u'Ŕ'), ('ŕ',u'ŕ'), ('Ŗ',u'Ŗ'), ('ŗ',u'ŗ'), ('Ř',u'Ř'), ('ř',u'ř'),
193
('Ś',u'Ś'), ('ś',u'ś'), ('Ŝ',u'Ŝ'), ('ŝ',u'ŝ'), ('Ş',u'Ş'), ('ş',u'ş'), ('Š',u'Š'), ('š',u'š'),
194
('Ţ',u'Ţ'), ('ţ',u'ţ'), ('Ť',u'Ť'), ('Ɂ',u'ť'), ('Ŧ',u'Ŧ'), ('ŧ',u'ŧ'), ('Ũ',u'Ũ'), ('ũ',u'ũ'),
195
('Ū',u'Ū'), ('ū',u'ū'), ('Ŭ',u'Ŭ'), ('ŭ',u'ŭ'), ('Ů',u'Ů'), ('ů',u'ů'), ('Ű',u'Ű'), ('ű',u'ű'),
196
('Ų',u'Ų'), ('ų',u'ų'), ('Ŵ',u'Ŵ'), ('ŵ',u'ŵ'), ('Ŷ',u'Ŷ'), ('ŷ',u'ŷ'), ('Ÿ',u'Ÿ'), ('Ź',u'Ź'),
197
('ź',u'ź'), ('Ż',u'Ż'), ('ż',u'ż'), ('Ž',u'Ž'), ('ž',u'ž'), ('ſ',u'ſ'),
202
#s = s.replace(code[0], code[1])
204
## DEFAULT IMPORTER METHODS
206
def foreignCards(self):
208
# Load file and parse it by minidom
209
self.loadSource(self.file)
211
# Migrating content / time consuming part
212
# addItemToCards is called for each sm element
213
self.logger(u'Parsing started.')
215
self.logger(u'Parsing done.')
217
# Return imported cards
225
def addItemToCards(self,item):
226
"This method actually do conversion"
232
card.fields.append(self._fudgeText(self._decode_htmlescapes(item.Question)))
233
card.fields.append(self._fudgeText(self._decode_htmlescapes(item.Answer)))
236
# pre-process scheduling data
237
tLastrep = time.mktime(time.strptime(item.LastRepetition, '%d.%m.%Y'))
240
# convert learning data
241
if not self.META.resetLearningData:
242
# migration of LearningData algorithm
243
card.interval = item.Interval
244
card.successive = item.Repetitions
245
##card.due = tToday + (float(item.Interval) * 86400.0) - tLastrep
246
card.due = tLastrep + (float(item.Interval) * 86400.0)
249
card.factor = float(item.AFactor.replace(',','.'))
250
card.lastFactor = float(item.AFactor.replace(',','.'))
252
# SM is not exporting all the information Anki keeps track off, so it
254
card.youngEase0 = item.Lapses
255
card.youngEase3 = item.Repetitions + item.Lapses
256
card.yesCount = item.Repetitions
257
card.noCount = item.Lapses
258
card.reps = card.yesCount + card.noCount
259
card.spaceUntil = card.due
260
card.combinedDue = card.due
263
# it's worth to have every theme (tree structure of sm collection) stored in tags, but sometimes not
264
# you can deceide if you are going to tag all toppics or just that containing some pattern
266
for pattern in self.META.pathsToBeTagged:
267
if item.lTitle != None and pattern.lower() in u" ".join(item.lTitle).lower():
270
if tTaggTitle or self.META.tagAllTopics:
271
# normalize - remove diacritic punctuation from unicode chars to ascii
272
item.lTitle = [ self._unicode2ascii(topic) for topic in item.lTitle]
274
# Transfrom xyz / aaa / bbb / ccc on Title path to Tag xyzAaaBbbCcc
275
# clean things like [999] or [111-2222] from title path, example: xyz / [1000-1200] zyx / xyz
277
# set Capital letters for first char of the word
278
tmp = list(set([ re.sub('(\[[0-9]+\])' , ' ' , i ).replace('_',' ') for i in item.lTitle ]))
279
tmp = list(set([ re.sub('(\W)',' ', i ) for i in tmp ]))
280
tmp = list(set([ re.sub( '^[0-9 ]+$','',i) for i in tmp ]))
281
tmp = list(set([ capwords(i).replace(' ','') for i in tmp ]))
282
tags = [ j[0].lower() + j[1:] for j in tmp if j.strip() <> '']
284
card.tags += u" ".join(tags)
286
if self.META.tagMemorizedItems and item.Interval >0:
287
card.tags += " Memorized"
289
self.logger(u'Element tags\t- ' + card.tags, level=3)
291
self.cards.append(card)
293
def logger(self,text,level=1):
294
"Wrapper for Anki logger"
296
dLevels={0:'',1:u'Info',2:u'Verbose',3:u'Debug'}
297
if level<=self.META.loggerLevel:
298
self.deck.updateProgress(_(text))
300
if self.META.logToStdOutput:
301
print self.__class__.__name__+ u" - " + dLevels[level].ljust(9) +u' -\t'+ _(text)
305
def openAnything(self,source):
306
"Open any source / actually only openig of files is used"
311
# try to open with urllib (if source is http, ftp, or file URL)
314
return urllib.urlopen(source)
315
except (IOError, OSError):
318
# try to open with native open function (if source is pathname)
321
except (IOError, OSError):
324
# treat source as string
326
return StringIO.StringIO(str(source))
328
def loadSource(self, source):
329
"""Load source file and parse with xml.dom.minidom"""
331
self.logger(u'Load started...')
332
sock = self.openAnything(self.source)
333
self.xmldoc = minidom.parse(sock).documentElement
335
self.logger(u'Load done.')
339
def parse(self, node=None):
340
"Parse method - parses document elements"
342
if node==None and self.xmldoc<>None:
345
_method = "parse_%s" % node.__class__.__name__
346
if hasattr(self,_method):
347
parseMethod = getattr(self, _method)
350
self.logger(u'No handler for method %s' % _method, level=3)
352
def parse_Document(self, node):
355
self.parse(node.documentElement)
357
def parse_Element(self, node):
360
_method = "do_%s" % node.tagName
361
if hasattr(self,_method):
362
handlerMethod = getattr(self, _method)
365
self.logger(u'No handler for method %s' % _method, level=3)
366
#print traceback.print_exc()
368
def parse_Text(self, node):
369
"Parse text inside elements. Text is stored into local buffer."
372
self.cntBuf.append(text)
374
#def parse_Comment(self, node):
376
# Source can contain XML comments, but we ignore them
382
def do_SuperMemoCollection(self, node):
383
"Process SM Collection"
385
for child in node.childNodes: self.parse(child)
387
def do_SuperMemoElement(self, node):
388
"Process SM Element (Type - Title,Topics)"
390
self.logger('='*45, level=3)
392
self.cntElm.append(SuperMemoElement())
393
self.cntElm[-1]['lTitle'] = self.cntMeta['title']
395
#parse all child elements
396
for child in node.childNodes: self.parse(child)
398
#strip all saved strings, just for sure
399
for key in self.cntElm[-1].keys():
400
if hasattr(self.cntElm[-1][key], 'strip'):
401
self.cntElm[-1][key]=self.cntElm[-1][key].strip()
404
smel = self.cntElm.pop()
406
# Process cntElm if is valid Item (and not an Topic etc..)
407
# if smel.Lapses != None and smel.Interval != None and smel.Question != None and smel.Answer != None:
408
if smel.Title == None and smel.Question != None and smel.Answer != None:
409
if smel.Answer.strip() !='' and smel.Question.strip() !='':
411
# migrate only memorized otherway skip/continue
412
if self.META.onlyMemorizedItems and not(int(smel.Interval) > 0):
413
self.logger(u'Element skiped \t- not memorized ...', level=3)
415
#import sm element data to Anki
416
self.addItemToCards(smel)
417
self.logger(u"Import element \t- " + smel['Question'], level=3)
420
self.logger('-'*45, level=3)
421
for key in smel.keys():
422
self.logger('\t%s %s' % ((key+':').ljust(15),smel[key]), level=3 )
424
self.logger(u'Element skiped \t- no valid Q and A ...', level=3)
428
# now we know that item was topic
429
# parseing of whole node is now finished
431
# test if it's really topic
432
if smel.Title != None:
433
# remove topic from title list
434
t = self.cntMeta['title'].pop()
435
self.logger(u'End of topic \t- %s' % (t), level=2)
437
def do_Content(self, node):
438
"Process SM element Content"
440
for child in node.childNodes:
441
if hasattr(child,'tagName') and child.firstChild != None:
442
self.cntElm[-1][child.tagName]=child.firstChild.data
444
def do_LearningData(self, node):
445
"Process SM element LearningData"
447
for child in node.childNodes:
448
if hasattr(child,'tagName') and child.firstChild != None:
449
self.cntElm[-1][child.tagName]=child.firstChild.data
451
# It's being processed in do_Content now
452
#def do_Question(self, node):
453
# for child in node.childNodes: self.parse(child)
454
# self.cntElm[-1][node.tagName]=self.cntBuf.pop()
456
# It's being processed in do_Content now
457
#def do_Answer(self, node):
458
# for child in node.childNodes: self.parse(child)
459
# self.cntElm[-1][node.tagName]=self.cntBuf.pop()
461
def do_Title(self, node):
462
"Process SM element Title"
464
t = self._decode_htmlescapes(node.firstChild.data)
465
self.cntElm[-1][node.tagName] = t
466
self.cntMeta['title'].append(t)
467
self.cntElm[-1]['lTitle'] = self.cntMeta['title']
468
self.logger(u'Start of topic \t- ' + u" / ".join(self.cntMeta['title']), level=2)
471
def do_Type(self, node):
472
"Process SM element Type"
474
if len(self.cntBuf) >=1 :
475
self.cntElm[-1][node.tagName]=self.cntBuf.pop()
478
if __name__ == '__main__':

    # for testing you can start it standalone
    import sys  # BUGFIX: sys.argv used below but sys was never imported

    #file = u'/home/epcim/hg2g/dev/python/sm2anki/ADVENG2EXP.xxe.esc.zaloha_FINAL.xml'
    #file = u'/home/epcim/hg2g/dev/python/anki/libanki/tests/importing/supermemo/original_ENGLISHFORBEGGINERS_noOEM.xml'
    #file = u'/home/epcim/hg2g/dev/python/anki/libanki/tests/importing/supermemo/original_ENGLISHFORBEGGINERS_oem_1250.xml'
    file = str(sys.argv[1])
    impo = SupermemoXmlImporter(Deck(), file)
    impo.foreignCards()
491
# vim: ts=4 sts=2 ft=python