forked from chuanconggao/PrefixSpan-py
-
Notifications
You must be signed in to change notification settings - Fork 0
/
prefixspan-cli
executable file
·114 lines (82 loc) · 3.18 KB
/
prefixspan-cli
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#! /usr/bin/env python3
"""
Usage:
prefixspan-cli (frequent | top-k) <threshold> [options] [<file>]
prefixspan-cli --help
Options:
--text Treat each item as text instead of integer.
--closed Return only closed patterns.
--generator Return only generator patterns.
--key=<key> Custom key function. [default: ]
Must be a Python function in form of "lambda patt, matches: ...", returning an integer value.
--bound=<bound> The upper-bound function of the respective key function. When unspecified, the same key function is used. [default: ]
Must be no less than the key function, i.e. bound(patt, matches) ≥ key(patt, matches).
Must be anti-monotone, i.e. for patt1 ⊑ patt2, bound(patt1, matches1) ≥ bound(patt2, matches2).
--filter=<filter> Custom filter function. [default: ]
Must be a Python function in form of "lambda patt, matches: ...", returning a boolean value.
--minlen=<minlen> Minimum length of patterns. [default: 1]
--maxlen=<maxlen> Maximum length of patterns. [default: 1000]
"""
from typing import *
import sys
from docopt import docopt
from prefixspan import PrefixSpan
from extratools.dicttools import invert, remap
from extratools.printtools import print2
def checkArg(arg, cond):
    # type: (str, Callable[[int], bool]) -> int
    """Parse the command-line value stored under *arg* as a validated integer.

    The raw value is read from the module-level ``argv`` mapping, which is
    populated by docopt in the ``__main__`` section of this script.

    Args:
        arg: Key into ``argv`` (e.g. ``"<threshold>"`` or ``"--minlen"``).
        cond: Predicate the parsed integer must satisfy.

    Returns:
        The parsed integer when it converts cleanly and ``cond`` accepts it.

    On any failure the error and the usage text are printed via ``print2``
    and the process exits with status 1 (``SystemExit``).
    """
    try:
        val = int(argv[arg])
        if not cond(val):
            # Reuse the conversion-failure path for out-of-range values.
            raise ValueError
    except (TypeError, ValueError):
        # int(None) raises TypeError rather than ValueError; treat a missing
        # value like an unparsable one instead of leaking a traceback.
        print2("ERROR: Cannot parse {}.".format(arg))
        print2(__doc__)
        sys.exit(1)

    return val
def checkFunc(arg):
    # type: (str) -> Callable[..., Any]
    """Evaluate the command-line value stored under *arg* into a callable.

    The raw text is read from the module-level ``argv`` mapping and is
    expected to be a Python expression such as
    ``"lambda patt, matches: ..."`` (see the usage text).

    Args:
        arg: Key into ``argv`` (e.g. ``"--key"``, ``"--bound"``, ``"--filter"``).

    Returns:
        The callable produced by evaluating the expression.

    If the expression cannot be evaluated, or does not evaluate to a
    callable, the error and the usage text are printed via ``print2`` and
    the process exits with status 1 (``SystemExit``).
    """
    try:
        # SECURITY: eval() executes arbitrary code taken from the command
        # line. That is the documented purpose of these options, but this
        # script must never be fed option values from an untrusted source.
        func = eval(argv[arg])
        if not callable(func):
            # e.g. "--key=5": reject now rather than failing later when the
            # value is called as a function.
            raise TypeError("{} is not callable".format(arg))
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not swallowed and misreported as parse errors.
        print2("ERROR: Cannot parse {}.".format(arg))
        print2(__doc__)
        sys.exit(1)

    return func
if __name__ == "__main__":
    # Script entry point: parse the command line, load the sequence database,
    # run the requested PrefixSpan query and print one pattern per line.
    #
    # NOTE: `argv` must remain a module-level name; checkArg() and
    # checkFunc() read it as a global.
    argv = docopt(__doc__)

    istext = argv["--text"]

    # One sequence per input line, items separated by single spaces.
    # Read from <file> when given, otherwise from stdin; close the file as
    # soon as it has been consumed (stdin is left open).
    infile = open(argv["<file>"]) if argv["<file>"] else sys.stdin
    try:
        docs = [line.strip().split(' ') for line in infile]
    finally:
        if infile is not sys.stdin:
            infile.close()

    if istext:
        # remap() presumably assigns each distinct word an integer id and
        # records the assignment in wordmap -- TODO confirm against extratools.
        wordmap = {}  # type: Dict[str, int]
        db = [list(remap(doc, wordmap)) for doc in docs]
    else:
        try:
            db = [
                [int(w) for w in doc]
                for doc in docs
            ]
        except ValueError:
            # Report non-integer items cleanly instead of with a traceback.
            print2("ERROR: Cannot parse input items as integers; use --text for text items.")
            sys.exit(1)

    ps = PrefixSpan(db)

    func = ps.frequent if argv["frequent"] else ps.topk

    key = checkFunc("--key") if argv["--key"] else None
    # Without an explicit bound, the key function doubles as its own bound.
    bound = checkFunc("--bound") if argv["--bound"] else key
    # A non-positive threshold is only accepted when a custom key is in use.
    threshold = checkArg("<threshold>", lambda v: key is not None or v > 0)

    closed = argv["--closed"]
    generator = argv["--generator"]

    # Deliberately not named `filter`: user-supplied lambdas are eval'd
    # against this module's globals, so a module-level `filter` would shadow
    # the builtin inside them.
    filterfunc = checkFunc("--filter") if argv["--filter"] else None

    if argv["--minlen"]:
        ps.minlen = checkArg("--minlen", lambda v: v > 0)
    if argv["--maxlen"]:
        # maxlen is validated against the (possibly just updated) minlen.
        ps.maxlen = checkArg("--maxlen", lambda v: v >= ps.minlen)

    if istext:
        # Inverse of wordmap, used to print the original words -- presumably
        # id -> word; TODO confirm against extratools.
        invwordmap = invert(wordmap)

    for freq, patt in func(
            threshold, closed=closed, generator=generator,
            key=key, bound=bound,
            filter=filterfunc
    ):
        print("{} : {}".format(' '.join(
            (invwordmap[i] for i in patt) if istext
            else (str(i) for i in patt)
        ), freq))