diff --git a/gitstats b/gitstats index c71b0e4..1c5ff38 100755 --- a/gitstats +++ b/gitstats @@ -15,8 +15,8 @@ import time import zlib if sys.version_info < (2, 6): - print >> sys.stderr, "Python 2.6 or higher is required for gitstats" - sys.exit(1) + print >> sys.stderr, "Python 2.6 or higher is required for gitstats" + sys.exit(1) from multiprocessing import Pool @@ -34,1146 +34,1160 @@ time_start = time.time() # environment variable "GNUPLOT" gnuplot_cmd = 'gnuplot' if 'GNUPLOT' in os.environ: - gnuplot_cmd = os.environ['GNUPLOT'] + gnuplot_cmd = os.environ['GNUPLOT'] conf = { - 'max_domains': 10, - 'max_ext_length': 10, - 'style': 'gitstats.css', - 'max_authors': 20, - 'authors_top': 5, - 'commit_begin': '', - 'commit_end': 'HEAD', - 'linear_linestats': 1, - 'project_name': '', - 'processes': 8, - 'start_date': '' + 'max_domains': 10, + 'max_ext_length': 10, + 'style': 'gitstats.css', + 'max_authors': 20, + 'authors_top': 5, + 'commit_begin': '', + 'commit_end': 'HEAD', + 'linear_linestats': 1, + 'project_name': '', + 'processes': 8, + 'start_date': '', + 'exclude_paths': [], } def getpipeoutput(cmds, quiet = False): - global exectime_external - start = time.time() - if not quiet and ON_LINUX and os.isatty(1): - print '>> ' + ' | '.join(cmds), - sys.stdout.flush() - p = subprocess.Popen(cmds[0], stdout = subprocess.PIPE, shell = True) - processes=[p] - for x in cmds[1:]: - p = subprocess.Popen(x, stdin = p.stdout, stdout = subprocess.PIPE, shell = True) - processes.append(p) - output = p.communicate()[0] - for p in processes: - p.wait() - end = time.time() - if not quiet: - if ON_LINUX and os.isatty(1): - print '\r', - print '[%.5f] >> %s' % (end - start, ' | '.join(cmds)) - exectime_external += (end - start) - return output.rstrip('\n') + global exectime_external + start = time.time() + if not quiet and ON_LINUX and os.isatty(1): + print '>> ' + ' | '.join(cmds), + sys.stdout.flush() + p = subprocess.Popen(cmds[0], stdout = subprocess.PIPE, shell = True) + processes=[p] + for x in cmds[1:]: + p = subprocess.Popen(x, stdin = p.stdout, stdout = subprocess.PIPE, shell = True) + processes.append(p) + output = p.communicate()[0] + for p in processes: + p.wait() + end = time.time() + if not quiet: + if ON_LINUX and os.isatty(1): + print '\r', + print '[%.5f] >> %s' % (end - start, ' | '.join(cmds)) + exectime_external += (end - start) + return output.rstrip('\n') def getlogrange(defaultrange = 'HEAD', end_only = True): - commit_range = getcommitrange(defaultrange, end_only) - if len(conf['start_date']) > 0: - return '--since="%s" "%s"' % (conf['start_date'], commit_range) - return commit_range + commit_range = getcommitrange(defaultrange, end_only) + if len(conf['start_date']) > 0: + return '--since="%s" "%s"' % (conf['start_date'], commit_range) + return commit_range def getcommitrange(defaultrange = 'HEAD', end_only = False): - if len(conf['commit_end']) > 0: - if end_only or len(conf['commit_begin']) == 0: - return conf['commit_end'] - return '%s..%s' % (conf['commit_begin'], conf['commit_end']) - return defaultrange + if len(conf['commit_end']) > 0: + if end_only or len(conf['commit_begin']) == 0: + return conf['commit_end'] + return '%s..%s' % (conf['commit_begin'], conf['commit_end']) + return defaultrange def getkeyssortedbyvalues(dict): - return map(lambda el : el[1], sorted(map(lambda el : (el[1], el[0]), dict.items()))) + return map(lambda el : el[1], sorted(map(lambda el : (el[1], el[0]), dict.items()))) # dict['author'] = { 'commits': 512 } - ...key(dict, 'commits') def getkeyssortedbyvaluekey(d, key): - return map(lambda el : el[1], sorted(map(lambda el : (d[el][key], el), d.keys()))) + return map(lambda el : el[1], sorted(map(lambda el : (d[el][key], el), d.keys()))) def getstatsummarycounts(line): - numbers = re.findall('\d+', line) - if len(numbers) == 1: - # neither insertions nor deletions: may probably only happen for "0 files changed" - numbers.append(0); - numbers.append(0); - elif len(numbers) == 2 and line.find('(+)') != -1: - numbers.append(0); # only insertions were printed on line - elif len(numbers) == 2 and line.find('(-)') != -1: - numbers.insert(1, 0); # only deletions were printed on line - return numbers + numbers = re.findall('\d+', line) + if len(numbers) == 1: + # neither insertions nor deletions: may probably only happen for "0 files changed" + numbers.append(0); + numbers.append(0); + elif len(numbers) == 2 and line.find('(+)') != -1: + numbers.append(0); # only insertions were printed on line + elif len(numbers) == 2 and line.find('(-)') != -1: + numbers.insert(1, 0); # only deletions were printed on line + return numbers VERSION = 0 def getversion(): - global VERSION - if VERSION == 0: - gitstats_repo = os.path.dirname(os.path.abspath(__file__)) - VERSION = getpipeoutput(["git --git-dir=%s/.git --work-tree=%s rev-parse --short %s" % - (gitstats_repo, gitstats_repo, getcommitrange('HEAD').split('\n')[0])]) - return VERSION + global VERSION + if VERSION == 0: + gitstats_repo = os.path.dirname(os.path.abspath(__file__)) + VERSION = getpipeoutput(["git --git-dir=%s/.git --work-tree=%s rev-parse --short %s" % + (gitstats_repo, gitstats_repo, getcommitrange('HEAD').split('\n')[0])]) + return VERSION def getgitversion(): - return getpipeoutput(['git --version']).split('\n')[0] + return getpipeoutput(['git --version']).split('\n')[0] def getgnuplotversion(): - return getpipeoutput(['%s --version' % gnuplot_cmd]).split('\n')[0] + return getpipeoutput(['%s --version' % gnuplot_cmd]).split('\n')[0] def getnumoffilesfromrev(time_rev): - """ - Get number of files changed in commit - """ - time, rev = time_rev - return (int(time), rev, int(getpipeoutput(['git ls-tree -r --name-only "%s"' % rev, 'wc -l']).split('\n')[0])) + """ + Get number of files changed in commit + """ + time, rev = time_rev + return (int(time), rev, int(getpipeoutput(['git ls-tree -r --name-only "%s"' % rev, 'wc -l']).split('\n')[0])) def getnumoflinesinblob(ext_blob): - """ - Get number of lines in blob - """ - ext, blob_id = ext_blob - return (ext, blob_id, int(getpipeoutput(['git cat-file blob %s' % blob_id, 'wc -l']).split()[0])) + """ + Get number of lines in blob + """ + ext, blob_id = ext_blob + return (ext, blob_id, int(getpipeoutput(['git cat-file blob %s' % blob_id, 'wc -l']).split()[0])) class DataCollector: - """Manages data collection from a revision control repository.""" - def __init__(self): - self.stamp_created = time.time() - self.cache = {} - self.total_authors = 0 - self.activity_by_hour_of_day = {} # hour -> commits - self.activity_by_day_of_week = {} # day -> commits - self.activity_by_month_of_year = {} # month [1-12] -> commits - self.activity_by_hour_of_week = {} # weekday -> hour -> commits - self.activity_by_hour_of_day_busiest = 0 - self.activity_by_hour_of_week_busiest = 0 - self.activity_by_year_week = {} # yy_wNN -> commits - self.activity_by_year_week_peak = 0 - - self.authors = {} # name -> {commits, first_commit_stamp, last_commit_stamp, last_active_day, active_days, lines_added, lines_removed} - - self.total_commits = 0 - self.total_files = 0 - self.authors_by_commits = 0 - - # domains - self.domains = {} # domain -> commits - - # author of the month - self.author_of_month = {} # month -> author -> commits - self.author_of_year = {} # year -> author -> commits - self.commits_by_month = {} # month -> commits - self.commits_by_year = {} # year -> commits - self.lines_added_by_month = {} # month -> lines added - self.lines_added_by_year = {} # year -> lines added - self.lines_removed_by_month = {} # month -> lines removed - self.lines_removed_by_year = {} # year -> lines removed - self.first_commit_stamp = 0 - self.last_commit_stamp = 0 - self.last_active_day = None - self.active_days = set() - - # lines - self.total_lines = 0 - self.total_lines_added = 0 - self.total_lines_removed = 0 - - # size - self.total_size = 0 - - # timezone - self.commits_by_timezone = {} # timezone -> commits - - # tags - self.tags = {} - - self.files_by_stamp = {} # stamp -> files - - # extensions - self.extensions = {} # extension -> files, lines - - # line statistics - self.changes_by_date = {} # stamp -> { files, ins, del } - - ## - # This should be the main function to extract data from the repository. - def collect(self, dir): - self.dir = dir - if len(conf['project_name']) == 0: - self.projectname = os.path.basename(os.path.abspath(dir)) - else: - self.projectname = conf['project_name'] - - ## - # Load cacheable data - def loadCache(self, cachefile): - if not os.path.exists(cachefile): - return - print 'Loading cache...' - f = open(cachefile, 'rb') - try: - self.cache = pickle.loads(zlib.decompress(f.read())) - except: - # temporary hack to upgrade non-compressed caches - f.seek(0) - self.cache = pickle.load(f) - f.close() - - ## - # Produce any additional statistics from the extracted data. - def refine(self): - pass - - ## - # : get a dictionary of author - def getAuthorInfo(self, author): - return None - - def getActivityByDayOfWeek(self): - return {} - - def getActivityByHourOfDay(self): - return {} - - # : get a dictionary of domains - def getDomainInfo(self, domain): - return None - - ## - # Get a list of authors - def getAuthors(self): - return [] - - def getFirstCommitDate(self): - return datetime.datetime.now() - - def getLastCommitDate(self): - return datetime.datetime.now() - - def getStampCreated(self): - return self.stamp_created - - def getTags(self): - return [] - - def getTotalAuthors(self): - return -1 - - def getTotalCommits(self): - return -1 - - def getTotalFiles(self): - return -1 - - def getTotalLOC(self): - return -1 - - ## - # Save cacheable data - def saveCache(self, cachefile): - print 'Saving cache...' - tempfile = cachefile + '.tmp' - f = open(tempfile, 'wb') - #pickle.dump(self.cache, f) - data = zlib.compress(pickle.dumps(self.cache)) - f.write(data) - f.close() - try: - os.remove(cachefile) - except OSError: - pass - os.rename(tempfile, cachefile) + """Manages data collection from a revision control repository.""" + def __init__(self): + self.stamp_created = time.time() + self.cache = {} + self.total_authors = 0 + self.activity_by_hour_of_day = {} # hour -> commits + self.activity_by_day_of_week = {} # day -> commits + self.activity_by_month_of_year = {} # month [1-12] -> commits + self.activity_by_hour_of_week = {} # weekday -> hour -> commits + self.activity_by_hour_of_day_busiest = 0 + self.activity_by_hour_of_week_busiest = 0 + self.activity_by_year_week = {} # yy_wNN -> commits + self.activity_by_year_week_peak = 0 + + self.authors = {} # name -> {commits, first_commit_stamp, last_commit_stamp, last_active_day, active_days, lines_added, lines_removed} + + self.total_commits = 0 + self.total_files = 0 + self.authors_by_commits = 0 + + # domains + self.domains = {} # domain -> commits + + # author of the month + self.author_of_month = {} # month -> author -> commits + self.author_of_year = {} # year -> author -> commits + self.commits_by_month = {} # month -> commits + self.commits_by_year = {} # year -> commits + self.lines_added_by_month = {} # month -> lines added + self.lines_added_by_year = {} # year -> lines added + self.lines_removed_by_month = {} # month -> lines removed + self.lines_removed_by_year = {} # year -> lines removed + self.first_commit_stamp = 0 + self.last_commit_stamp = 0 + self.last_active_day = None + self.active_days = set() + + # lines + self.total_lines = 0 + self.total_lines_added = 0 + self.total_lines_removed = 0 + + # size + self.total_size = 0 + + # timezone + self.commits_by_timezone = {} # timezone -> commits + + # tags + self.tags = {} + + self.files_by_stamp = {} # stamp -> files + + # extensions + self.extensions = {} # extension -> files, lines + + # line statistics + self.changes_by_date = {} # stamp -> { files, ins, del } + + ## + # This should be the main function to extract data from the repository. + def collect(self, dir): + self.dir = dir + if len(conf['project_name']) == 0: + self.projectname = os.path.basename(os.path.abspath(dir)) + else: + self.projectname = conf['project_name'] + + ## + # Load cacheable data + def loadCache(self, cachefile): + if not os.path.exists(cachefile): + return + print 'Loading cache...' + f = open(cachefile, 'rb') + try: + self.cache = pickle.loads(zlib.decompress(f.read())) + except: + # temporary hack to upgrade non-compressed caches + f.seek(0) + self.cache = pickle.load(f) + f.close() + + ## + # Produce any additional statistics from the extracted data. + def refine(self): + pass + + ## + # : get a dictionary of author + def getAuthorInfo(self, author): + return None + + def getActivityByDayOfWeek(self): + return {} + + def getActivityByHourOfDay(self): + return {} + + # : get a dictionary of domains + def getDomainInfo(self, domain): + return None + + ## + # Get a list of authors + def getAuthors(self): + return [] + + def getFirstCommitDate(self): + return datetime.datetime.now() + + def getLastCommitDate(self): + return datetime.datetime.now() + + def getStampCreated(self): + return self.stamp_created + + def getTags(self): + return [] + + def getTotalAuthors(self): + return -1 + + def getTotalCommits(self): + return -1 + + def getTotalFiles(self): + return -1 + + def getTotalLOC(self): + return -1 + + ## + # Save cacheable data + def saveCache(self, cachefile): + print 'Saving cache...' + tempfile = cachefile + '.tmp' + f = open(tempfile, 'wb') + #pickle.dump(self.cache, f) + data = zlib.compress(pickle.dumps(self.cache)) + f.write(data) + f.close() + try: + os.remove(cachefile) + except OSError: + pass + os.rename(tempfile, cachefile) class GitDataCollector(DataCollector): - def collect(self, dir): - DataCollector.collect(self, dir) - - self.total_authors += int(getpipeoutput(['git shortlog -s %s' % getlogrange(), 'wc -l'])) - #self.total_lines = int(getoutput('git-ls-files -z |xargs -0 cat |wc -l')) - - # tags - lines = getpipeoutput(['git show-ref --tags']).split('\n') - for line in lines: - if len(line) == 0: - continue - (hash, tag) = line.split(' ') - - tag = tag.replace('refs/tags/', '') - output = getpipeoutput(['git log "%s" --pretty=format:"%%at %%aN" -n 1' % hash]) - if len(output) > 0: - parts = output.split(' ') - stamp = 0 - try: - stamp = int(parts[0]) - except ValueError: - stamp = 0 - self.tags[tag] = { 'stamp': stamp, 'hash' : hash, 'date' : datetime.datetime.fromtimestamp(stamp).strftime('%Y-%m-%d'), 'commits': 0, 'authors': {} } - - # collect info on tags, starting from latest - tags_sorted_by_date_desc = map(lambda el : el[1], reversed(sorted(map(lambda el : (el[1]['date'], el[0]), self.tags.items())))) - prev = None - for tag in reversed(tags_sorted_by_date_desc): - cmd = 'git shortlog -s "%s"' % tag - if prev != None: - cmd += ' "^%s"' % prev - output = getpipeoutput([cmd]) - if len(output) == 0: - continue - prev = tag - for line in output.split('\n'): - parts = re.split('\s+', line, 2) - commits = int(parts[1]) - author = parts[2] - self.tags[tag]['commits'] += commits - self.tags[tag]['authors'][author] = commits - - # Collect revision statistics - # Outputs "