#!/usr/bin/env python
# This is a basic parsing script that gets the last commit
# of each repository in the RSEpedia as of the current date.
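#
# A minimal usage sketch (the settings file path below is illustrative; it
# should point at the settings.yml of an RSEpedia checkout):
#
#     python measure-activity.py --settings-file ./rsepedia/settings.yml --outdir ./data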

__author__ = "Vanessa Sochat"
__copyright__ = "Copyright 2023, Vanessa Sochat"
__license__ = "MPL 2.0"

from rse.main import Encyclopedia
from rse.utils.command import Command
from rse.utils.file import write_json, read_json

from datetime import datetime
import tempfile
import requests
import shutil
import subprocess
import shlex
import argparse
import sys
import os

today = datetime.now()


def clone(url, dest):
    """
    Shallow clone a repository into dest, returning the clone path
    (or None if the clone failed).
    """
    dest = os.path.join(dest, os.path.basename(url))
    cmd = Command("git clone --depth 1 %s %s" % (url, dest))
    cmd.execute()
    if cmd.returncode != 0:
        print("Issue cloning %s" % url)
        return
    return dest


def creation_date(repo, dest, filename):
    """
    Get the creation date for a file (when it was first added).
    """
    # --diff-filter=A with --follow finds the commit that added the file,
    # and --format=%ct prints that commit's Unix timestamp
    command = shlex.split(
        f"git -C {dest} log --diff-filter=A --follow --format=%ct -1 -- {filename}"
    )
    res = subprocess.check_output(command)
    ts = int(res.decode("utf-8").strip())
    timestr = str(datetime.fromtimestamp(ts))
    print(f"{repo.uid} was added to the database {timestr}")
    return timestr


def last_commit(repo, dest):
    """
    Get the date of the last commit for a repository.
    """
    command = shlex.split(f"git -C {dest} log -1 --format=%ct")
    res = subprocess.check_output(command)
    ts = int(res.decode("utf-8").strip())
    timestr = str(datetime.fromtimestamp(ts))
    print(f"{repo.uid} was last updated {timestr}")
    return timestr


def look_for_doi(repo):
    """
    Many records have a DOI, and the publication date is a more accurate
    signal than the date the record was added to the RSEpedia.
    """
    url = "https://zenodo.org/api/records"
    doi = repo.data.get("doi") or repo.data.get("data", {}).get("doi")
    if not doi:
        return doi, None
    if isinstance(doi, list):
        doi = doi[0]

    # We only know how to look up Zenodo DOIs
    # (the Zenodo rate limit is 5000/hour, so we should be ok)
    if "zenodo" not in doi:
        return doi, None
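
    # Illustrative example (not a DOI from the database): for
    # "10.5281/zenodo.1012531" the basename is "zenodo.1012531", so the
    # record id is "1012531" and the API URL becomes
    # https://zenodo.org/api/records/1012531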
    record_id = os.path.basename(doi).split(".")[-1]
    zenodo_url = f"{url}/{record_id}"
    r = requests.get(zenodo_url)
    record = r.json()

    # Bail if we didn't get a usable API response
    if "metadata" not in record:
        return doi, None
    published = record["metadata"]["publication_date"]
    print(f"Found actual publication date {published} for {doi}")
    return doi, published


def derive_creation_timestamps(pedia, outdir, added_json, settings_file):
    """
    Parse the commit that added the file for each repository. This is our
    best estimate of a publication / release date in the particular place
    it was parsed (give or take a week).
    """
    print("🤓️ Looking for when each software repository was added to the database...")
    print("If a Zenodo DOI is provided, we can use that instead.")
    repos = list(pedia.list())
    total = len(repos)

    # The repository directory holds the settings file
    repo_dir = os.path.dirname(settings_file)

    # Use cached results if they exist
    added = {}
    if os.path.exists(added_json):
        added = read_json(added_json)

    for i, reponame in enumerate(repos):
        print(f"{i} of {total}", end="\r")
        repo = pedia.get(reponame[0])
        if repo.url in added:
            continue
        doi, published = look_for_doi(repo)

        # Get the path of the metadata file relative to the repository
        filename = os.path.relpath(repo.filename, repo_dir)
        created_at = creation_date(repo, repo_dir, filename)
        added[repo.url] = {"created_at": created_at, "doi": doi, "published": published}
    return added


def get_parser():
    parser = argparse.ArgumentParser(
        description="Research Software Encyclopedia Last Updated Analyzer",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--settings-file",
        dest="settings_file",
        help="custom path to settings file.",
    )
    parser.add_argument(
        "-o",
        "--outdir",
        help="Output directory for data.",
        default=os.path.join(os.getcwd(), "data"),
    )
    return parser


def main():
    p = get_parser()
    args, extra = p.parse_known_args()

    # Make sure the output directory exists, organized by parsing date
    timestamp = "%s-%s-%s" % (today.year, today.month, today.day)
    outdir = os.path.abspath(os.path.join(args.outdir, timestamp))
    if not os.path.exists(outdir):
        print(f"Creating output directory {outdir}")
        os.makedirs(outdir)

    # Create a base temporary folder to work from
    tempdir = tempfile.mkdtemp()
    pedia = Encyclopedia(args.settings_file)
    repos = list(pedia.list())
    total = len(repos)

    # Keep track of the last updated commit for each repository. We save
    # results as we go, and use cached results for the day if they exist.
    meta = {}
    times_json = os.path.join(outdir, "last-commit-times.json")
    if os.path.exists(times_json):
        meta = read_json(times_json)

    added_json = os.path.join(outdir, "rsepedia-times.json")

    # This should be refactored into a nice class; the function is kind of janky
    added = derive_creation_timestamps(pedia, outdir, added_json, args.settings_file)
    write_json(added, added_json)
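
    # Each entry in added_json maps a repository URL to its derived
    # timestamps, e.g. (values illustrative):
    #   {"created_at": "2023-01-02 10:20:30",
    #    "doi": "10.5281/zenodo.1012531", "published": "2023-01-01"}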

    for i, reponame in enumerate(repos):
        print(f"{i} of {total}", end="\r")

        # Prepare to clone the repository
        repo = pedia.get(reponame[0])
        if not repo.url or repo.url in meta:
            print(f"Skipping {repo.url}")
            continue
        dest = None
        try:
            # Try the clone (and cut out early if not successful)
            dest = clone(repo.url, tempdir)
            if not dest:
                continue
            meta[repo.url] = last_commit(repo, dest)
        except Exception:
            print(f"Issue with {repo.url}, skipping")
        try:
            if dest and os.path.exists(dest):
                shutil.rmtree(dest)
        except Exception:
            write_json(meta, times_json)
            sys.exit(
                "Likely too many open files; check with ulimit -n and set with ulimit -n 4096"
            )

        # Save as we go
        write_json(meta, times_json)

    # One final save
    write_json(meta, times_json)

    # Write the combined results: we have dates for when each repository
    # was added to the RSEpedia and when it was published
    results_json = os.path.join(outdir, "results.json")
    results = {}
    for url, commit_time in meta.items():
        # The Zenodo publication date is likely the more accurate one
        published = added[url]["created_at"]
        zenodo_published = added[url]["published"]
        if zenodo_published:
            published = zenodo_published
        results[url] = {
            "last_commit": commit_time,
            "added_rsepedia": added[url]["created_at"],
            "zenodo_published": added[url]["published"],
            "published": published,
            "doi": added[url]["doi"],
        }
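
    # Each results.json record looks like this (values illustrative):
    #   {"last_commit": "2023-03-04 05:06:07",
    #    "added_rsepedia": "2023-01-02 10:20:30",
    #    "zenodo_published": "2023-01-01",
    #    "published": "2023-01-01",
    #    "doi": "10.5281/zenodo.1012531"}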
    write_json(results, results_json)


if __name__ == "__main__":
    main()