2
# science_log_parser.py <Peter.Bienstman@UGent.be> starting from code of
3
# Ryan Michael <kerinin@gmail.com> and Joonatan Kaartinen <jkaartinen@iki.fi>
11
import bz2
import os
import time

from mnemosyne.libmnemosyne.utils import traceback_string
13
DAY = 24 * 60 * 60 # Seconds in a day.
16
class ScienceLogParser(object):
18
"""Parse the txt logs and write the info it contains to a database
21
This is complicated by several idiosyncrasies.
23
First, before 2.0, dates were only stored with a resolution of a day.
24
However, the timestamps of the logs make it possible to determine e.g.
25
the actual interval with a resolution of a second. This however requires
26
holding on to the exact time of the previous repetition of each card,
27
while parsing the logs.
29
A second, more thorny idiosyncrasy is the matter of the first grading of a
30
card. When adding cards manually through the UI, users set an initial
31
grade there. For cards that are imported, there is no such possibility.
33
Throughout the history of Mnemosyne, several approaches have been taken
34
to deal with this issue.
36
Before 0.9.8, the inconsistency was simply ignored, and the initial
37
grading of a card in the 'add cards' dialog was not counted as a grading.
39
Starting with 0.9.8, Dirk Hermann recognised the inconsistency, and
40
regardless of grade, the initial grading of a card in the 'add cards'
41
dialog was always counted as an acquisition repetition. So, before any 'R'
42
log entry of a card added through the 'add cards' dialog, 'acq_reps' was
43
already 1. For cards imported, 'acq_reps' stayed zero.
45
When later on Mnemosyne acquired the possibility to learn new cards in
46
random order, a new card attribute 'unseen' needed to be introduced,
47
keeping track of whether the card was already seen in the interactive
48
review process. This hack complicated the code, and therefore, starting
49
with Mnemosyne 2.0, a different approach was taken. The initial grading
50
of a card in the 'add cards' dialog was only counted as an acquisition
51
repetition (and explicitly logged as an 'R' event) when the grade was 2
52
or higher. In other cases, the grade got set to -1 (signifying unseen),
53
just as for imported cards.
55
When parsing old logs, the data needs to be adjusted to fit the latest
56
scheme. To clarify what needs to be done, the following table shows a
57
summary of the contents of the logs after creating a new card through
58
the GUI (giving it an initial grade at the same time), as well as the
59
value of 'acq_reps' at that time.
61
initial grade 0,1 initial grade 2,3,4,5
63
version < 0.9.8: New item, acq_reps=0 New item, acq_reps=0
65
0.9.8 <= version < 2.0 New item, acq_reps=1 New item, acq_reps=1
67
2.0 <= version New item, acq_reps=0 New item, acq_reps=0
71
So, to convert to the latest scheme, we need the following actions:
74
initial grade 0,1 initial grade 2,3,4,5
76
version < 0.9.8: None add R at creation
79
0.9.8 <= version < 2.0 decrease acq_reps add R at creation
81
Since there is no grading on import, we don't need to do anything special
84
The database object should implement the following API:
86
def log_started_program(self, timestamp, program_name_version)
87
def log_stopped_program(self, timestamp)
88
def log_started_scheduler(self, timestamp, scheduler_name)
89
def log_loaded_database(self, timestamp, scheduled_count,
90
def log_saved_database(self, timestamp, scheduled_count,
91
def log_added_card(self, timestamp, card_id)
92
def log_deleted_card(self, timestamp, card_id)
93
def log_repetition(self, timestamp, card_id, grade, easiness, acq_reps,
94
ret_reps, lapses, acq_reps_since_lapse, ret_reps_since_lapse,
95
scheduled_interval, actual_interval, new_interval, thinking_time)
96
def set_offset_last_rep(self, card_id, offset, last_rep)
97
def offset_last_rep(self, card_id)
98
def update_card_after_log_import(id, creation_time, offset)
100
Note that we don't go through the log() level of abstraction here, as this
101
code is also used outside libmnemosyne for parsing logs in the statistics server.
106
versions_1_x_phase_1 = ["0.1", "0.9", "0.9.1", "0.9.2", "0.9.3", "0.9.4",
107
"0.9.5","0.9.6","0.9.7"]
109
versions_1_x_phase_2 = ["0.9.8", "0.9.8.1", "0.9.9", "0.9.10", "1.0",
110
"1.0.1", "1.0.1.1", "1.0.2", "1.1", "1.1.1", "1.2",
113
def __init__(self, database, ids_to_parse=None):

    """Only converting ids in 'ids_to_parse' makes it possible to reliably
    import different mem files (which all share the same log files).
    For efficiency reasons, 'ids_to_parse' is best a dictionary.

    'database' is any object implementing the logging API described in the
    class docstring.

    """

    self.database = database
    self.ids_to_parse = ids_to_parse
    # Used until a 'Program started' log line tells us the real version.
    self.version_number = "1.2.2" # Default guess for missing logs.
125
def parse(self, filename):

    """Parse a single log file and write its events to the database.

    'filename' is either the standard 'log.txt', or a science-server
    style name '<user_id>_<log_number>.*' /
    '<user_id>_<machine_id>_<log_number>.*' (optionally bz2-compressed),
    from which the ids are recovered.

    """

    # Science-server logs encode user/machine/log number in the filename.
    if os.path.basename(filename) != "log.txt":
        before_extension = os.path.basename(filename).split(".")[0]
        if before_extension.count("_") == 1:
            self.user_id, self.log_number = before_extension.split("_")
        else:
            self.user_id, self.machine_id, self.log_number = \
                before_extension.split("_")
        self.log_number = int(self.log_number)
    if filename.endswith(".bz2"):
        self.logfile = bz2.BZ2File(filename)
    else:
        # 'open' instead of the Python-2-only 'file' builtin.
        self.logfile = open(filename)
    # For pre-2.0 logs, we need to hang on to the previous timestamp, as
    # this will be used as the time the card was shown, in order to
    # calculate the actual interval. (The timestamps for repetitions are
    # when the card was graded, not when it was presented to the user.)
    self.timestamp = None
    self.previous_timestamp = None
    self.lower_timestamp_limit = 1121021345 # 2005-07-10 21:49:05.
    self.upper_timestamp_limit = time.time()
    for line in self.logfile:
        try:
            self._parse_line(line)
        except Exception:
            # Best effort: skip corrupt lines, but do not swallow
            # KeyboardInterrupt/SystemExit like a bare 'except' would.
            print("Ignoring error while parsing line:\n%s" % line)
            print(traceback_string())
    self.logfile.close()
155
def _parse_line(self, line):

    """Dispatch a single 'timestamp : event' log line to the database.

    Raises TypeError for timestamps outside the plausible range, and
    ValueError for unparsable dates; 'parse' catches and reports these.

    """

    parts = line.rstrip().rsplit(" : ")
    self.timestamp = int(time.mktime(time.strptime(parts[0],
        "%Y-%m-%d %H:%M:%S")))
    if not self.lower_timestamp_limit < self.timestamp < \
        self.upper_timestamp_limit:
        raise TypeError("Ignoring impossible date %s" % parts[0])
    if parts[1].startswith("Program started"):
        # Parse version string. They typically look like:
        #  Mnemosyne 1.0-RC nt win32
        #  Mnemosyne 1.0 RC posix linux2
        #  Mnemosyne 1.1.1_debug3 posix linux2
        #  Mnemosyne 1.2.1 posix linux2
        self.version = parts[2].replace("_", "-")
        self.version_number = self.version.split()[1].split("-")[0]
        self.database.log_started_program(self.timestamp, self.version)
    elif parts[1].startswith("Scheduler"):
        scheduler_name = parts[2]
        self.database.log_started_scheduler(self.timestamp, scheduler_name)
    elif parts[1].startswith("Loaded database"):
        Loaded, database, scheduled, non_memorised, active = \
            parts[1].split(" ")
        self.database.log_loaded_database(self.timestamp, int(scheduled),
            int(non_memorised), int(active))
    elif parts[1].startswith("New item"):
        self._parse_new_item(parts[1])
    elif parts[1].startswith("Imported item"):
        self._parse_imported_item(parts[1])
    elif parts[1].startswith("Deleted item"):
        self._parse_deleted_item(parts[1])
    elif parts[1].startswith("R "):
        self._parse_repetition(parts[1])
    elif parts[1].startswith("Saved database"):
        Saved, database, scheduled, non_memorised, active = \
            parts[1].split(" ")
        self.database.log_saved_database(self.timestamp, int(scheduled),
            int(non_memorised), int(active))
    elif parts[1].startswith("Program stopped"):
        self.database.log_stopped_program(self.timestamp)
    # Needed by pre-2.0 repetition handling to reconstruct the time the
    # card was shown (as opposed to graded).
    self.previous_timestamp = self.timestamp
196
def _parse_new_item(self, new_item_chunk):

    """Handle a 'New item <id> <grade> <new_interval>' log entry,
    converting the pre-2.0 'acq_reps' bookkeeping to the 2.0 scheme
    (see the class docstring for the conversion table).

    """

    New, item, card_id, grade, new_interval = new_item_chunk.split(" ")
    if self.ids_to_parse and card_id not in self.ids_to_parse:
        return
    grade = int(grade)  # Compare numerically, not as a string.
    # 'offset' corrects the 'acq_reps' counts of later repetitions:
    # +1 for pre-0.9.8 logs where the initial grading (grade >= 2) was
    # never counted, -1 for 0.9.8..1.2.2 logs where a grade < 2 initial
    # grading was counted but should not have been.
    offset = 0
    if grade >= 2 and self.version_number in self.versions_1_x_phase_1:
        offset = 1
    elif grade < 2 and self.version_number in self.versions_1_x_phase_2:
        offset = -1
    self.database.log_added_card(self.timestamp, card_id)
    self.database.set_offset_last_rep(card_id, offset, last_rep=0)
    self.database.update_card_after_log_import(card_id, self.timestamp, offset)
    # Since 2.0, an initial grade >= 2 is logged as an explicit 'R' event;
    # synthesise that event for 1.x logs.
    if grade >= 2 and self.version_number in \
        self.versions_1_x_phase_1 + self.versions_1_x_phase_2:
        self.database.log_repetition(self.timestamp, card_id, grade,
            easiness=2.5, acq_reps=1, ret_reps=0, lapses=0,
            acq_reps_since_lapse=1, ret_reps_since_lapse=0,
            scheduled_interval=0, actual_interval=0, new_interval=\
            int(new_interval), thinking_time=0, last_rep=0,
            next_rep=0, scheduler_data=0)
218
def _parse_imported_item(self, imported_item_chunk):

    """Handle an 'Imported item' log entry from a 1.x log.

    Imported cards carry no initial grading, so no 'R' event is
    synthesised; we only register the card itself.

    """

    Imported, item, card_id, grade, ret_reps, last_rep, next_rep, interval \
        = imported_item_chunk.split(" ")
    if self.ids_to_parse and card_id not in self.ids_to_parse:
        return
    # Check if we've seen this card before. If so, we are restoring from a
    # backup and don't need to update the database.
    try:
        offset, last_rep = self.database.offset_last_rep(card_id)
    except TypeError:
        # Nothing to unpack: the card is unknown, so this is a genuinely
        # new import. (NOTE(review): reconstructed from a partial source;
        # confirm 'offset_last_rep' signals "unknown" via TypeError.)
        offset, last_rep = 0, 0
        self.database.log_added_card(self.timestamp, card_id)
        self.database.set_offset_last_rep(card_id, offset, last_rep)
        self.database.update_card_after_log_import(card_id, self.timestamp,
            offset)
235
def _parse_deleted_item(self, deleted_item_chunk):

    """Handle a 'Deleted item <id>' log entry."""

    Deleted, item, card_id = deleted_item_chunk.split(" ")
    if self.ids_to_parse and card_id not in self.ids_to_parse:
        return
    # Only log the deletion if we've seen the card before, as a safeguard
    # against corrupt logs.
    try:
        offset, last_rep = self.database.offset_last_rep(card_id)
        self.database.log_deleted_card(self.timestamp, card_id)
    except TypeError:
        # Unknown card (nothing to unpack): ignore the deletion.
        # NOTE(review): reconstructed from a partial source; confirm the
        # TypeError convention against 'offset_last_rep'.
        pass
247
def _parse_repetition(self, repetition_chunk):
249
blocks = repetition_chunk.split(" | ")
250
R, id, grade, easiness = blocks[0].split(" ")
251
if self.ids_to_parse and id not in self.ids_to_parse:
254
easiness = float(easiness)
255
acq_reps, ret_reps, lapses, acq_reps_since_lapse, \
256
ret_reps_since_lapse = blocks[1].split(" ")
257
acq_reps, ret_reps = int(acq_reps), int(ret_reps)
259
acq_reps_since_lapse = int(acq_reps_since_lapse)
260
ret_reps_since_lapse = int(ret_reps_since_lapse)
261
scheduled_interval, actual_interval = blocks[2].split(" ")
262
scheduled_interval = int(scheduled_interval)
263
actual_interval = int(actual_interval)
264
new_interval, noise = blocks[3].split(" ")
265
new_interval = int(new_interval) + int(noise)
266
thinking_time = round(float(blocks[4]))
267
# Deal with interval data for pre 2.0 logs.
268
if self.version_number in \
269
self.versions_1_x_phase_1 + self.versions_1_x_phase_2:
271
# Calculate 'actual_interval' and update 'last_rep'.
272
# (Note: 'last_rep' is the time the card was graded, not when
274
offset, last_rep = self.database.offset_last_rep(id)
276
actual_interval = self.previous_timestamp - last_rep
279
self.database.set_offset_last_rep(id, offset, self.timestamp)
281
# Make sure the card exists (e.g. due to missing logs).
284
self.database.log_added_card(self.timestamp, id)
285
self.database.set_offset_last_rep(id, offset, last_rep=0)
286
self.database.update_card_after_log_import(id, self.timestamp, offset)
287
# Convert days to seconds.
288
scheduled_interval *= DAY
290
# Take offset into account.
293
acq_reps_since_lapse += offset
295
self.database.log_repetition(self.timestamp, id, grade, easiness,
296
acq_reps, ret_reps, lapses, acq_reps_since_lapse,
297
ret_reps_since_lapse, scheduled_interval, actual_interval,
298
new_interval, thinking_time, last_rep=0, next_rep=0,