#!/usr/bin/env python3
# Select Lines from a CSV File According to a Leitner Scheduler
# Written in 2012 by 伴上段
#
# To the extent possible under law, the author(s) have dedicated all copyright
# and related and neighboring rights to this software to the public domain
# worldwide. This software is distributed without any warranty.
#
# You should have received a copy of the CC0 Public Domain Dedication along
# with this software. If not, see
# <http://creativecommons.org/publicdomain/zero/1.0/>.
import argparse, csv, datetime, itertools, os.path, random, sys
class TRandomSelector(object):
    """Uniformly sample up to `capacity` items from a stream (reservoir sampling)."""

    def __init__(self, capacity):
        self.capacity = int(capacity)
        self.sample = []
        self.counter = 0
        if not self.capacity:
            # A zero-capacity selector never retains anything, so short-circuit Add.
            self.Add = (lambda o: None)

    def __iter__(self):
        for selected in self.sample:
            yield selected

    def Add(self, o):
        self.counter += 1
        if self.counter <= self.capacity:
            self.sample.append(o)
        else:
            # Algorithm R: replace a random slot with probability capacity/counter.
            # (random.randrange avoids the off-by-one bias of randint(0, counter).)
            tag = random.randrange(self.counter)
            if tag < self.capacity:
                self.sample[tag] = o
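# A minimal usage sketch (the items here are hypothetical): sampling 2 of 5
# stream elements, each retained with probability 2/5.
#
#   selector = TRandomSelector(2)
#   for item in ("a", "b", "c", "d", "e"):
#       selector.Add(item)
#   chosen = list(selector)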
class TBucket(object):
    __slots__ = ("id", "size", "first", "next", "time_offset")

    def __init__(self, myid, first, next, time_offset):
        self.id = myid
        self.size = 0
        self.first = first              # the first bucket in the chain (demotion target)
        self.next = next                # the next bucket in the chain (promotion target)
        self.time_offset = time_offset  # delay added to a line's due date on arrival
        super().__init__()

    def Add(self, line, dateandtime):
        line.date = dateandtime + self.time_offset
        self.size += 1

    def RemoveOne(self):
        self.size -= 1
class TLine(object):
    __slots__ = ("id", "date", "bucket", "fields")

    def __init__(self, myid, dateandtime, bucket, fields=None):
        self.id = myid
        self.date = dateandtime
        self.bucket = bucket
        self.fields = fields
        super().__init__()

    def Demote(self, dateandtime):
        self.bucket.RemoveOne()
        self.bucket = self.bucket.first
        self.bucket.Add(self, dateandtime)

    def Promote(self, dateandtime):
        self.bucket.RemoveOne()
        self.bucket = self.bucket.next
        self.bucket.Add(self, dateandtime)
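# Sketch of the bucket-hopping semantics (the dates are hypothetical): a '+'
# review promotes a line one bucket ahead and reschedules it by that bucket's
# delay; a '-' review sends it back to the first bucket.
#
#   line.Promote(datetime.datetime(2012, 1, 15))  # due = Jan 15 + next bucket's delay
#   line.Demote(datetime.datetime(2012, 1, 15))   # back to bucket 0, due immediately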
def Main(output, num, new, bucketdelays, logfile, deckfile, field_sep, date_format, show_buckets):
    # Check arguments for illegal values.
    ret = 0
    if num < 0:
        sys.stderr.write("The number of lines cannot be negative.\n")
        ret = 2
    if new < 0:
        sys.stderr.write("The number of new lines cannot be negative.\n")
        ret = 2
    if not os.path.exists(deckfile):
        sys.stderr.write("The deck " + deckfile + " does not exist.\n")
        ret = 2
    if not os.path.exists(logfile):
        sys.stderr.write("The log " + logfile + " does not exist.\n")
        ret = 2
    if any(delay <= 0 for delay in bucketdelays):
        sys.stderr.write("Zero and negative bucket delays are not allowed.\n")
        ret = 2
    if ret != 0:
        return ret
    # Create the chain of buckets from the client-specified delays.
    bucket = TBucket(0, None, None, datetime.timedelta(days=0))
    first_bucket = bucket
    bucket.next = bucket
    bucket.first = bucket
    for bucket_id, delay in enumerate(bucketdelays, start=1):
        bucket.next = TBucket(bucket_id, first_bucket, None, datetime.timedelta(days=delay))
        bucket = bucket.next
    bucket.next = bucket  # the last bucket promotes to itself
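    # For example, with bucket delays "1 3 7" the resulting chain is:
    #
    #   bucket 0 (+0 days) -> bucket 1 (+1 day) -> bucket 2 (+3 days)
    #       -> bucket 3 (+7 days) -> bucket 3 (the last bucket repeats)
    #
    # and every bucket's `first` points back at bucket 0.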
    # Process the log file. Create a TLine for each new unique ID encountered
    # and track its progress as it hops across buckets.
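    # With the default tab separator and date format, a log record looks like
    # this (the ID "card42" is hypothetical; <TAB> stands for the separator):
    #
    #   card42<TAB>2012年01月15日<TAB>+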
    lines = {}
    with open(logfile, 'r') as logf:
        # Start at 1 so diagnostics report 1-based line numbers.
        for lineno, fields in enumerate(csv.reader(logf, delimiter=field_sep), start=1):
            if len(fields) != 3:
                sys.stderr.write(logfile + ":" + str(lineno) + ": invalid number of fields: " + str(len(fields)) + "\n")
                return 3
            try:
                date_time = datetime.datetime.strptime(fields[1], date_format)
            except ValueError as e:
                sys.stderr.write(logfile + ":" + str(lineno) + ": invalid date format: " + str(e) + "\n")
                return 3
            entry = lines.get(fields[0], None)
            if entry is None:
                entry = TLine(fields[0], None, first_bucket)
                lines[fields[0]] = entry
            if fields[2] == '+':
                entry.Promote(date_time)
            elif fields[2] == '-':
                entry.Demote(date_time)
            else:
                sys.stderr.write(logfile + ":" + str(lineno) + ": invalid mutation in third field: must be + or -\n")
                return 3
    # Process the lines from the deck. Match each line with its record in the
    # lines dictionary (if such a record exists). Lines lacking log entries are
    # marked as "new" by setting their buckets to None.
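    # With the default tab separator, a deck line might look like this (the ID
    # and fields are hypothetical; everything after the ID is free-form):
    #
    #   card42<TAB>question text<TAB>answer text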
    now = datetime.datetime.now()
    with open(deckfile, 'r') as deckf:
        for fields in csv.reader(deckf, delimiter=field_sep):
            if len(fields) == 0:
                continue
            if fields[0] not in lines:
                lines[fields[0]] = TLine(fields[0], now, None, fields)
            else:
                lines[fields[0]].fields = fields
    # Early out: if we only need to show the lines and their bucket numbers,
    # do so now and exit.
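    # For the hypothetical deck line above, a dumped line would look like:
    #
    #   2<TAB>card42<TAB>question text<TAB>answer text
    #
    # where 2 is the line's current bucket ID (-1 if it has no log records).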
    if show_buckets:
        for line in lines.values():
            if line.fields is None:
                # Skip log records whose IDs no longer appear in the deck.
                continue
            bucket_id = str(line.bucket.id) if line.bucket is not None else "-1"
            output.write(field_sep.join(itertools.chain((bucket_id,), line.fields)) + "\n")
        return 0
    # Randomly select due lines that have already been reviewed (i.e., lines with
    # records in the log file) and new lines (lines lacking such records).
    # Combine the results and write them to output.
    due_selector, new_selector = TRandomSelector(num), TRandomSelector(new)
    for line in (line for line in lines.values() if line.date <= now and line.fields is not None):
        (due_selector if line.bucket is not None else new_selector).Add(line)
    for line in itertools.chain(due_selector, new_selector):
        output.write(field_sep.join(line.fields) + "\n")
    return 0
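# A hedged sketch of driving Main directly (the file names are hypothetical):
# select at most 5 reviewed lines and 2 new ones from a tab-separated deck
# with bucket delays of 1, 3, and 7 days.
#
#   status = Main(sys.stdout, 5, 2, [1, 3, 7], "flashcards.log",
#                 "flashcards.txt", "\t", "%Y年%m月%d日", False)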
if __name__ == "__main__":
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description="""Select CSV-formatted lines from the specified file (called the "deck")
according to a basic Leitner scheduler. Both the deck and the specified log
file must be CSV files with the same field separator character, which is
specified via -s.

This program is useful for scheduling reviews of flashcards stored within
CSV files.

about Leitner scheduling:

A Leitner scheduler sorts items into numbered "buckets". Items in the first
bucket (the one with the lowest numeric ID) are always due now. Items in
other buckets might be due now. If an item is "successfully reviewed"
(the meaning of that phrase is domain-specific), then it moves to the next
bucket and is scheduled for a later date. If an item is not successfully
reviewed, then the item moves back to the first bucket. Items in the
last bucket (the one with the highest numeric ID) that are successfully
reviewed stay within the last bucket.

Each bucket has an associated delay in days. When an item is moved into a
bucket, the item is scheduled for the current date plus that bucket's delay.
(The first bucket is implicitly defined and has no delay.)

formatting:

This program treats the first field of each nonempty line from the deck as
that line's unique ID. (If multiple lines have the same ID, then the last
line wins.) It uses the ID to match the line with records scanned from the
log file. Each log file line must have the following format:

  <ID> <field-separator> <timestamp> <field-separator> <+>|<->

where <ID> is the unique ID of the line associated with the record,
<field-separator> is the CSV field separator, <timestamp> is the record's
timestamp (you can modify its format via the -f option), and <+> and <-> are
the '+' and '-' characters. '+' indicates that the line was successfully
reviewed at the specified time, whereas '-' indicates that the line wasn't
successfully reviewed at the specified time. What "successfully reviewed"
means is domain-specific.

bucketdelay is a delay in days. It must be a positive integer. The first
bucket is implicitly defined with no delay, so you don't have to specify a
delay for it.

output:

This program prints randomly selected due lines to stdout.""", epilog="""examples:

  $ oleitner flashcards.txt flashcards.log 1 3 7 14
      Schedule lines from flashcards.txt using flashcards.log as the log
      file. This uses five buckets with delays of zero, one, three, seven,
      and fourteen days, respectively.

  $ oleitner -n 20 -e 10 flashcards.txt flashcards.log 1 3 7 14
      Schedule at most 30 lines (at most 20 lines with entries in
      flashcards.log and at most 10 lines without such entries) from
      flashcards.txt using flashcards.log as the log file. This uses the
      same buckets as in the previous example.

  $ oleitner -s ',' flashcards.txt flashcards.log 1 3 7 14
      Same as the first example, but use a comma as the CSV field separator
      instead of tab characters.

  $ oleitner -f '%Y/%m/%d' flashcards.txt flashcards.log 1 3 7 14
      Same as the first example, but use the log timestamp format '%Y/%m/%d'
      instead of the default.

  $ oleitner -b flashcards.txt flashcards.log 1 3 7 14
      Same as the first example, but skip line selection and dump all lines
      from flashcards.txt to stdout with their bucket numbers prefixed to
      them. (-1 indicates that the line has no records in the log file.)
""")
parser.add_argument("-n", "--num-lines", type=int, default=10, dest="num", help="the maximum number of lines with log records to select (default: 10)")
parser.add_argument("-e", "--num-new-lines", type=int, default=4, dest="new", help="the maximum number of lines without log records to select (default: 4)")
parser.add_argument("-s", "--field-sep", default="\t", help="the CSV field separator (default: \\t)")
parser.add_argument("-f", "--date-format", default="%Y年%m月%d日", help="the format of dates/timestamps in the log file (uses date/strftime flags, default: %%Y年%%m月%%d日)")
parser.add_argument("-b", "--show-buckets", default=False, action="store_true", help="just dump the lines to standard output along with their current bucket numbers (the bucket number is the first field of each line in the output, -1 for lines without log entries)")
parser.add_argument("deckfile", help="a CSV-formatted file containing scheduled lines")
parser.add_argument("logfile", help="a CSV-formatted file containing records for the deck's lines")
parser.add_argument("bucketdelay", type=int, nargs="+", help="the number of days to add to a line's due date when it's moved to the corresponding Leitner bucket")
args = parser.parse_args()
ret = Main(sys.stdout, args.num, args.new, args.bucketdelay, args.logfile, args.deckfile, args.field_sep, args.date_format, args.show_buckets)
sys.exit(ret)