forked from expectocode/telegram-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmostactiveusers.py
executable file
·149 lines (130 loc) · 5.51 KB
/
mostactiveusers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python3
"""
A program to plot a pie chart of the most active users in a Telegram chat
"""
import argparse
from json import loads
from os import path
from collections import defaultdict
import matplotlib.pyplot as plt
from datetime import date,datetime
from operator import itemgetter
def parse_args():
    """
    Build the command-line parser and parse the program arguments.

    Returns the argparse namespace with the attributes ``file`` (required),
    ``output_folder`` (None unless given), ``figure_size`` (two ints,
    default [12, 8]), ``minimum_percentage`` (float, default 2) and
    ``date_range`` (string holding two dates, default covers every date).
    """
    parser = argparse.ArgumentParser(
        description="Create a pie chart showing the most active users in a Telegram chat")
    required = parser.add_argument_group('required arguments')
    required.add_argument(
        '-f', '--file',
        help='the jsonl chatlog file to analyse',
        required=True
    )
    # Adjacent string literals are concatenated verbatim, so every fragment
    # that is followed by another one must end with a space.
    parser.add_argument(
        '-o', '--output-folder',
        help='the folder to save the pie chart image in. '
             'Using this option will make the graph not display on screen.')
    parser.add_argument(
        '-s', '--figure-size',
        help='the size of the figure shown or saved (X and Y size). '
             'Choose an appropriate value for your screen size. Default 12 8.',
        nargs=2, type=int, default=[12, 8]
    )
    parser.add_argument(
        '-m', '--minimum-percentage',
        help='the minimum percentage of activity a person must contribute '
             'to get their own slice of the pie chart. Default 2',
        type=float, default=2
    )
    parser.add_argument(
        '-d', '--date-range',
        help='the range of dates you want to look at data between. '
             'Must be in format YYYY-MM-DD YYYY-MM-DD with the first date '
             'the start of the range, and the second the end. Example: '
             "-d '2017-05-15 2017-11-20'. Make sure you don't put a day "
             'that is too high for the month eg 30th February.',
        # hopefully no chatlogs contain these dates :p
        default="1000-01-01 4017-01-01"
    )
    return parser.parse_args()
def get_dates(arg_dates):
    """
    Parse a "YYYY-MM-DD YYYY-MM-DD" string into a (start_date, end_date) tuple.

    The two dates may be separated by any run of whitespace.

    Raises SystemExit carrying an error message (so the program ends with a
    non-zero status and the message on stderr) when the string does not hold
    exactly two dates or when either one is not a valid calendar date.
    """
    daterange = arg_dates.split()
    if len(daterange) != 2:
        raise SystemExit(
            "You must give exactly two dates (start and end) separated by a space")
    try:
        start_date = datetime.strptime(daterange[0], "%Y-%m-%d").date()
        end_date = datetime.strptime(daterange[1], "%Y-%m-%d").date()
    except ValueError as err:
        # strptime rejects both malformed text and impossible days (30th February)
        raise SystemExit("Invalid date range '{}': {}".format(arg_dates, err))
    return (start_date, end_date)
def extract_infos(event):
    """
    Pull the fields needed for the statistics out of one chatlog event.

    Returns a tuple of (date of the event, length of its text,
    sender's peer_id, sender's print_name).
    """
    sent_on = date.fromtimestamp(event['date'])
    char_count = len(event['text'])
    sender = event['from']
    return (sent_on, char_count, sender['peer_id'], sender['print_name'])
def make_ddict(jsonfile, start, end):
    """
    Count the characters each user sent between two dates (inclusive).

    jsonfile is an iterable of lines, each holding one JSON event; blank
    lines are skipped and events without a 'text' key are ignored.
    start and end are the dates bounding the events that get counted.

    Return (dict of str(user ID) -> display name, total chars,
    defaultdict of user ID -> chars sent).
    """
    names = {}
    counter = defaultdict(int)
    total_datapoints = 0
    for line in jsonfile:
        if not line.strip():
            # a stray blank line is not valid JSON and would make loads() fail
            continue
        event = loads(line)
        if 'text' not in event:
            continue
        msgdate, textlength, userid, printname = extract_infos(event)
        if not start <= msgdate <= end:
            continue
        total_datapoints += textlength
        counter[userid] += textlength
        key = str(userid)
        if key not in names:
            # this code assumes that chatlog has most recent events first
            # which is default for telegram-history-dumper
            # users without a print name are labelled with their ID instead
            names[key] = printname if printname != "" else key
    return names, total_datapoints, counter
def annotate_figure(filename):
    """
    Title the current plot after the chatlog name and square up its axes.
    """
    title_text = "Most active users in {} by chars sent".format(filename)
    plt.title(title_text, y=1.05)
    # equal axis scaling, so it plots as a circle
    plt.axis('equal')
def make_trimmed_ddict(counter, total_datapoints, names, min_percent):
    """
    Turn per-user-ID counts into per-name counts, merging small contributors.

    counter maps user ID -> chars sent, names maps str(user ID) -> display
    name, total_datapoints is the overall char count and min_percent is the
    share of that total a user needs to get their own entry.

    Returns a defaultdict of display name -> chars sent, where every user
    below the threshold is summed into the "other" entry.
    """
    trimmedCounter = defaultdict(int)
    # find the char count below which people are added to "other"
    min_chars = (min_percent / 100) * total_datapoints
    for person, frequency in counter.items():
        if frequency < min_chars:
            trimmedCounter["other"] += frequency
            continue
        label = names[str(person)]
        if label == "other":
            print("Someone in this chat is called 'other'. "
                  "They will be absorbed into the 'other' pie slice.")
        # accumulate rather than assign: the label can already hold a value
        # (the "other" slice, or another user with the same display name)
        trimmedCounter[label] += frequency
    return trimmedCounter
def main():
    """
    main function

    Parse the arguments, count the chars each user sent in the chosen date
    range, then draw the pie chart: saved as a PNG when an output folder was
    given, shown in a window otherwise. Prints a notice and draws nothing
    when no text falls inside the date range.
    """
    args = parse_args()
    filepath = args.file
    savefolder = args.output_folder
    figure_size = tuple(args.figure_size)
    start_date, end_date = get_dates(args.date_range)
    # default 2
    # anyone who sends less than this percentage of the total is 'other'
    other_percent = args.minimum_percentage

    # make filename just the name of the file, with no leading directories and no extension
    filename = path.splitext(path.split(filepath)[-1])[0]

    # JSON text is UTF-8; do not depend on the platform default encoding
    with open(filepath, 'r', encoding='utf-8') as jsonfile:
        names, total_datapoints, counter = make_ddict(jsonfile, start_date, end_date)

    if total_datapoints == 0:
        # there is nothing to unpack into pie slices, so stop before plotting
        print("No message text found in the given date range; nothing to plot.")
        return

    trimmedCounter = make_trimmed_ddict(counter, total_datapoints, names, other_percent)
    sortedCounter = sorted(trimmedCounter.items(), key=itemgetter(1))
    print(sortedCounter)

    # transpose [(name, chars), ...] into (names, chars)
    labels, sizes = zip(*sortedCounter)

    plt.figure(figsize=figure_size)
    plt.pie(sizes, labels=labels, startangle=135)
    annotate_figure(filename)

    if savefolder is not None:
        # if there is a given folder to save the figure in, save it there
        plt.savefig(path.join(savefolder, "Most active users in {}.png".format(filename)))
    else:
        # if a save folder was not specified, just open a window to display graph
        plt.show()
# Run the program only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()