-
Notifications
You must be signed in to change notification settings - Fork 0
/
users.py
108 lines (84 loc) · 2.89 KB
/
users.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from __future__ import division
import collections
import config
import logging
import os.path
import util
# Module-level logger, registered under the fixed name "ghc.users".
logger = logging.getLogger("ghc.users")
class User:
    """A user identified by an integer id.

    Attributes:
        id: integer user id; the sole basis for equality, ordering
            and hashing.
        repos: set of repos associated with the user (starts empty).
        languages: list of languages associated with the user
            (starts empty).
    """

    def __init__(self, id):
        """Create a user from anything ``int()`` accepts (``7``, ``"7"``)."""
        self.id = int(id)
        self.repos = set()
        self.languages = []

    def __eq__(self, other):
        """Return True when ``other`` carries the same ``id``."""
        # Objects without an ``id`` (None, strings, ...) are simply "not
        # equal": hand the comparison back to Python instead of letting
        # the AttributeError escape.
        try:
            return self.id == other.id
        except AttributeError:
            return NotImplemented

    def __str__(self):
        return "({0.id})".format(self)

    def __repr__(self):
        return "User({0.id})".format(self)

    def __lt__(self, other):
        return self.id < other.id

    def __gt__(self, other):
        return self.id > other.id

    def __hash__(self):
        # Consistent with __eq__: equal users share the same id.
        return self.id

    def to_json(self):
        """Return the user serialized as an indented JSON object string.

        ``repos`` is emitted as a sorted list, because JSON has no set
        type and sorting keeps the output deterministic.
        """
        # Imported locally: json is not among the imports visible at the
        # top of this file.
        import json

        # Use the underscore on certain attributes
        # to force more desirable ordering.
        return json.dumps({
            '_id': self.id,
            '_repos': sorted(self.repos),
            '__languages': self.languages,
        }, sort_keys=True, indent=2)
# Module-level caches, filled on first use by the accessor functions in
# this module. None means "not loaded yet".
_user_watches = None  # set by get_user_watches()
_repo_freqs = None  # set by get_repo_frequencies()
_test_ids = None  # set by get_test_user_ids()
def get_user_watches():
    """
    Returns a dict of user id keys mapped to a set
    of repo ids being watched by that user.

    Lookup order: the in-memory module cache, then the pickle under
    config.CALC_DATA_PATH, and finally a fresh parse of 'data.txt'
    under config.SRC_DATA_PATH (one "user_id:repo_id" pair per line).
    A fresh parse is written back to the pickle before returning.
    """
    global _user_watches
    path = os.path.join(config.CALC_DATA_PATH, 'user_watches.pickle')

    # NOTE(review): presumably util.load_pickle returns a falsy value
    # when no pickle exists yet -- confirm against util.
    user_watches = _user_watches or util.load_pickle(path)
    if user_watches:
        _user_watches = user_watches
        return user_watches

    user_watches = collections.defaultdict(set)
    # The context manager closes the source file even if a line fails
    # to parse.
    with open(os.path.join(config.SRC_DATA_PATH, 'data.txt')) as source:
        for line in source:
            line = line.strip()
            if not line:
                # Tolerate blank/trailing lines rather than failing on
                # the tuple unpacking below.
                continue
            user_id, repo_id = line.split(':')
            user_watches[int(user_id)].add(int(repo_id))

    util.store_pickle(user_watches, path, debug=True)
    _user_watches = user_watches
    return user_watches
def get_repo_frequencies():
    """
    Returns a map of repo id to (frequency, relative_freq) tuples.

    frequency is the number of users watching the repo, and
    relative_freq is that count divided by the total number of watches
    across all users.

    Lookup order: the in-memory module cache, then the pickle under
    config.CALC_DATA_PATH, and finally a fresh computation from
    get_user_watches(), which is written back to the pickle.
    """
    global _repo_freqs
    path = os.path.join(config.CALC_DATA_PATH, 'repo_frequencies1.pickle')

    # NOTE(review): presumably util.load_pickle returns a falsy value
    # when no pickle exists yet -- confirm against util.
    repo_frequencies = _repo_freqs or util.load_pickle(path)
    if repo_frequencies:
        _repo_freqs = repo_frequencies
        return repo_frequencies

    user_watches = get_user_watches()
    total_watches = sum(len(w) for w in user_watches.values())
    logger.debug("Total watches is {0}".format(total_watches))

    # Count every watch once, then attach the relative frequency in a
    # single pass instead of rebuilding a tuple for each watch.
    counts = collections.Counter(
        repo for repos in user_watches.values() for repo in repos
    )
    # The division is a true division: the file enables
    # `from __future__ import division`.
    repo_frequencies = {
        repo: (count, count / total_watches)
        for repo, count in counts.items()
    }

    util.store_pickle(repo_frequencies, path, debug=True)
    _repo_freqs = repo_frequencies
    return repo_frequencies
def get_test_user_ids():
    """
    Gets the user ids to guess repos for.

    Returns a list of ints read from 'test.txt' under
    config.SRC_DATA_PATH (one id per line). The list is cached at
    module level, so the file is only re-read while the cache is empty.
    """
    global _test_ids
    if not _test_ids:
        # The context manager closes the file; blank lines are skipped
        # so a trailing newline cannot make int() fail.
        with open(os.path.join(config.SRC_DATA_PATH, "test.txt")) as source:
            _test_ids = [int(line) for line in source if line.strip()]
    return _test_ids