From 90ad91cdbb329972900c145fc23e40dd84d1f449 Mon Sep 17 00:00:00 2001 From: lhbvvvvv <104072573+lhbvvvvv@users.noreply.github.com> Date: Mon, 25 Sep 2023 21:06:06 +0800 Subject: [PATCH] refactor: improve python kernel v2.0(#1375) * refactor: improve python kernel * organize open-digger python kernel code in a more object-oriented way * refactor:python->python_v2 * refactor: save python and create python_v2 Delete config file add __init__.py add __init__.py add .gitignore --- .gitignore | 6 +- python_v2/README.md | 52 ++ python_v2/config.py | 50 ++ python_v2/db/clickhouse_wrapper.py | 24 + python_v2/db/neo4j_wrapper.py | 23 + python_v2/label_data_utils.py | 175 +++++++ python_v2/metrics/__init__.py | 4 + python_v2/metrics/basic.py | 345 +++++++++++++ python_v2/metrics/chaoss.py | 795 +++++++++++++++++++++++++++++ python_v2/metrics/index.py | 275 ++++++++++ python_v2/metrics/related_users.py | 11 + python_v2/open_digger.py | 53 ++ 12 files changed, 1812 insertions(+), 1 deletion(-) create mode 100644 python_v2/README.md create mode 100644 python_v2/config.py create mode 100644 python_v2/db/clickhouse_wrapper.py create mode 100644 python_v2/db/neo4j_wrapper.py create mode 100644 python_v2/label_data_utils.py create mode 100644 python_v2/metrics/__init__.py create mode 100644 python_v2/metrics/basic.py create mode 100644 python_v2/metrics/chaoss.py create mode 100644 python_v2/metrics/index.py create mode 100644 python_v2/metrics/related_users.py create mode 100644 python_v2/open_digger.py diff --git a/.gitignore b/.gitignore index 702588245..f81b85efa 100644 --- a/.gitignore +++ b/.gitignore @@ -18,8 +18,12 @@ node_modules sample_data/data # Ignore python -*/**/__init__.py */**/__pycache__ python/workspace/* python/workspace.py python/local_config.py + +# Ignore python_v2 +python_v2/workspace/* +python_v2/workspace.py +python_v2/local_config.py diff --git a/python_v2/README.md b/python_v2/README.md new file mode 100644 index 000000000..e2621e6be --- /dev/null +++ b/python_v2/README.md @@ -0,0 +1,52 @@ +# Getting Start + +## If you want to do some data analysis work: +Start your ClickHouse container, which should be set up in [Clickhouse-sample-data](../sample_data/README.md) + +1. Clone OpenDigger `git clone https://github.com/X-lab2017/open-digger.git` + +2. Enter the repo path `cd open-digger` + +3. Go to the `python` folder in the open-digger root directory, create a file named 'local_config.py'(this file has already added into `.gitignore` file.) for Python Kernel with the following contents: + + ```python + local_config = { + 'db': { + 'clickhouse': { + 'host':'172.17.0.1', + 'user':'default' + }, + 'neo4j':{ + 'port': '7687', + } + } + } + ``` + the `host` above is the host of the ClickHouse server. We can find it using `docker inspect containert_name`, and copy the `Gateway` like this: + + ```shell + $ docker inspect container_name | grep Gateway + "Gateway": "172.17.0.1", + "IPv6Gateway": "", + "Gateway": "172.17.0.1", + "IPv6Gateway": "", + ``` + If you use your own data, you can also change `host` field to your own host IP +4. Use `docker build -t opendigger-jupyter-python:1.0 $(pwd)` to make a docker image, this image is based on `miniconda`. You can check the `Dockerfile` in root directory. + + > If you are using **Windows CMD**, all the `$(pwd)` here should be replaced by `%cd%`. And if you are using **Windows Powershell**, all the `$(pwd)` here should be replaced by `${pwd}`. 
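+    > For example, on Windows CMD the build command becomes `docker build -t opendigger-jupyter-python:1.0 %cd%`, and the run command in step 5 becomes `docker run -it --name python_notebook_name --rm -p 8888:8888 -v %cd%:/python_kernel/notebook opendigger-jupyter-python:1.0`.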
+ > + > **Notice:** Pathnames of directories like "pwd" may use `\` to join the directory in some versions of Windows. We recommend using absolute paths. + +5. Then we can use `docker run -it --name python_notebook_name --rm -p 8888:8888 -v $(pwd):/python_kernel/notebook opendigger-jupyter-python:1.0` to create and run the container. + +6. Open the link in console log like `http://127.0.0.1:8888/lab?token=xxxxx`. + +7. If the source code under `python` folder changed, you need to stop the notebook docker using `docker stop python_notebook_name` and restart the notebook kernel using `docker run -it --name python_notebook_name --rm -p 8888:8888 -v $(pwd):/python_kernel/notebook opendigger-jupyter-python:1.0` to reload the sorce code. + +8. You can find the notebook folder, where we provide demos in the handbook. You can create a new file, and happy data exploring! + Attention: you need to do this work in `notebook` or other parallel folder. If you run in root directory, it can't work because of python import rules. + +## If you are a developer: + +You can also make `workspace.py` in `python` folder. and run it. diff --git a/python_v2/config.py b/python_v2/config.py new file mode 100644 index 000000000..f2a10966f --- /dev/null +++ b/python_v2/config.py @@ -0,0 +1,50 @@ +inited = False +config = { + 'general': { + 'owner': 'X-lab2017', + 'repo': 'OpenDigger', + 'baseUrl': 'http://open-digger.opensource-service.cn/', + }, + 'db': { + 'clickhouse': { + 'host': 'localhost', #python里的clickhouse_driver用的tcp端口9000 + 'port': '9000', + 'user': '', + 'password': '', + 'protocol': 'http:', + 'format': 'JSON', + 'database': 'opensource', + }, + 'neo4j': { + 'host':'neo4j://localhost:7687', + } + }, + 'oss': { + 'ali': { + 'region': '', + 'accessKeyId': '', + 'accessKeySecret': '', + 'bucket': '', + } + }, + 'ci': { + 'token':'', + } +} +def mergeConfig(base_config, local_config): + for key, val in local_config.items(): + if isinstance(val, dict): + mergeConfig(base_config[key], val) + else: + base_config[key] = val + return base_config +def getConfig(): + global config + if not inited: + try: + from local_config import local_config + config = mergeConfig(config, local_config) + return config + except: + return config + return config diff --git a/python_v2/db/clickhouse_wrapper.py b/python_v2/db/clickhouse_wrapper.py new file mode 100644 index 000000000..e7e15afcf --- /dev/null +++ b/python_v2/db/clickhouse_wrapper.py @@ -0,0 +1,24 @@ +from easydict import EasyDict +from config import getConfig +from clickhouse_driver import Client + +class ClickhouseWrapper(object): + def __init__(self): + if not hasattr(ClickhouseWrapper, "_first_init"): + config = EasyDict(getConfig()).db.clickhouse + try: + self.client = Client(config.host, config.port, config.database, config.user, config.password) + except : + print("CLICKHOUSE INIT FAILED") + def __new__(cls, *args, **kwargs): + + if not hasattr(ClickhouseWrapper, "_instance" ): + ClickhouseWrapper._instance = object.__new__(cls) + return ClickhouseWrapper._instance + + + def query(self, q): + return self.client.execute(q) + + def queryDataframe(self,q): + return self.client.query_dataframe(q) diff --git a/python_v2/db/neo4j_wrapper.py b/python_v2/db/neo4j_wrapper.py new file mode 100644 index 000000000..39e0dcc2a --- /dev/null +++ b/python_v2/db/neo4j_wrapper.py @@ -0,0 +1,23 @@ +from py2neo import Graph +from easydict import EasyDict +from config import getConfig + +class Neo4jWrapper(object): + def __init__(self): + neo4j_config = EasyDict(getConfig()).db.neo4j 
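+        # As with ClickhouseWrapper, __new__ below caches a single _instance, so
+        # constructing the wrapper twice, e.g.
+        #     g1 = Neo4jWrapper()
+        #     g2 = Neo4jWrapper()
+        #     assert g1 is g2
+        # yields the same object (note that __init__ still re-runs on each construction).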
+ # self.driver = Graph(neo4j_config.host) + try: + self.driver = Graph(neo4j_config.host) + except Exception as e: + print(e) + print("NEO4J INIT ERROR") + + def __new__(cls, *args, **kwargs): + + if not hasattr(Neo4jWrapper, "_instance" ): + Neo4jWrapper._instance = object.__new__(cls) + return Neo4jWrapper._instance + + def query(self, query_sql): + result = self.driver.run(query_sql) # return a cursor object + return result.data() diff --git a/python_v2/label_data_utils.py b/python_v2/label_data_utils.py new file mode 100644 index 000000000..c4899addf --- /dev/null +++ b/python_v2/label_data_utils.py @@ -0,0 +1,175 @@ +import os +import yaml +import platform +from typing import List +labelInputDir = '../labeled_data' +labelInputPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), labelInputDir) + +supportedTypes = set(['Region', 'Company', 'Community', 'Project', 'Foundation','Tech-0', 'Tech-1', 'Tech-2','Domain-0', 'Bot']) + +supportedKey = set(['label', 'github_repo', 'github_org', 'github_user']) +GitHubData = { + 'githubRepos': [], + 'githubOrgs': [], + 'githubUsers': [], +} + +emptyData = { + 'githubRepos': [], + 'githubOrgs': [], + 'githubUsers': [], +} + +LabelItem = { + 'identifier': '', + 'content': { + 'name': '', + 'type': '', + 'data': '', + }, + 'parsed': True +} +LabelItem.update(GitHubData) + +ParsedLabelItem = { + 'identifier': '', + 'type': '', + 'name': '' +} +ParsedLabelItem.update(GitHubData) + +def getLabelData(): + if not os.path.isdir(labelInputPath): + print('{} input path is not a directory.'.format(labelInputPath)) + return [] + labelMap = {} #() + indexFileName = '{}index.yml'.format(os.path.sep) + labelFileSuffix = '.yml' + def getfileProcessor(f): + if not f.endswith('.yml'): return + # convert windows favor path to linux favor path + + identifier = processLabelIdentifier(':{}'.format(f[0:f.find(indexFileName)] if f.endswith(indexFileName) else f[0:f.find(labelFileSuffix)])) + content = open(os.path.join(labelInputPath, f),encoding='utf-8').read() + content = yaml.load(content,Loader=yaml.FullLoader) + labelMap[identifier] = { + 'identifier':identifier, + 'content':content, + 'parsed': False, + 'githubOrgs': [], + 'githubRepos': [], + 'githubUsers': [], + } + + readPath(labelInputPath, '', getfileProcessor) + data = processLabelItems(labelMap) + return data + +def readPath(p, base, fileProcessor): + """_summary_ + + Args: + p (string): _description_ + base (string): _description_ + fileProcessor(f:string)->void. 
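+
+    Walks directory p recursively and calls fileProcessor with each file's path
+    relative to the starting directory. For a hypothetical label file
+    labeled_data/companies/foo.yml, the processor receives 'companies/foo.yml',
+    which getLabelData then turns into the label identifier ':companies/foo'.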
+ """ + if not os.path.isdir(p): + fileProcessor(base) + else: + for f in os.listdir(p): + readPath(os.path.join(p, f), os.path.join(base, f), fileProcessor) + +def processLabelItems(map_item)->List: + """_summary_ + + Args: + map_item (Map): _description_ + LabelItem (_type_): _description_ + + Returns: + ParsedLabelItem[]: _description_ + """ + for item in map_item.values(): + parseItem(item, map_item) + return list(map(lambda item: {'identifier': item.get('identifier'), + 'type': item.get('content').get('type'), + 'name': item.get('content').get('name'), + 'githubRepos': list(set(item.get('githubRepos'))), + 'githubOrgs': list(set(item.get('githubOrgs'))), + 'githubUsers': list(set(item.get('githubUsers'))), + }, list(map_item.values()))) + +def parseItem(item, map_item): + """_summary_ + + Args: + item (LabelItem): _description_ + map_item (Map): _description_ + """ + if item.get('parsed'): return + if item.get('content').get('type') and item.get('content').get('type') not in supportedTypes: + raise Exception('Not supported type {}'.format(item.get('content').get('type'))) + for key in item.get('content').get('data'): + if not key in supportedKey: + raise Exception('Not supported element={}, identifier={}').format(key, item.get('identifier')) + if key == 'github_repo': + item.get('githubRepos').extend(x for x in item.get('content').get('data')[key]) + elif key == 'github_org': + item.get('githubOrgs').extend(x for x in item.get('content').get('data')[key]) + elif key == 'github_user': + item.get('githubUsers').extend(x for x in item.get('content').get('data')[key]) + elif key == 'label': + labels = item.get('content').get('data')[key] + for label in labels: + identifier = label if label.startswith(':') else processLabelIdentifier(os.path.join(item.get('identifier'), label)) + innerItem = map_item.get(identifier) + if innerItem == None: + raise Exception('Can not find nest identifier {} for {}'.format(identifier, item.get('identifier'))) + if not innerItem.get('parsed'): + parseItem(innerItem, map_item) + item.get('githubOrgs').extend(x for x in innerItem.get('githubOrgs')) + item.get('githubRepos').extend(x for x in innerItem.get('githubRepos')) + item.get('githubUsers').extend(x for x in innerItem.get('githubUsers')) + item['parsed'] = True + +def processLabelIdentifier(identifier: str)-> str: + if platform.system() == 'Windows': + return os.path.altsep.join(identifier.split(os.path.sep)) + else: return identifier + +def labelDataToGitHubData(data)->GitHubData: + """_summary_ + + Args: + data (list of ParsedLabelItem): _description_ + + Returns: + GitHubData: _description_ + """ + repoSet = set([]) + orgSet = set([]) + userSet = set([]) + for item in data: + for r in item.get('githubRepos'): repoSet.add(r) + for o in item.get('githubOrgs'): orgSet.add(o) + for u in item.get('githubUsers'): userSet.add(u) + return { + "githubRepos": list(repoSet), + "githubOrgs": list(orgSet), + "githubUsers": list(userSet), + } + +def getGitHubData(typeOrIds: List)-> GitHubData: + """_summary_ + + Args: + typeOrIds (List): _description_ + + Returns: + GitHubData: _description_ + """ + if len(typeOrIds) == 0: return emptyData + data = getLabelData() + if data == None: return emptyData + arr = list(filter(lambda i: i.get('type') in typeOrIds or i.get('identifier') in typeOrIds, data)) + return labelDataToGitHubData(arr) diff --git a/python_v2/metrics/__init__.py b/python_v2/metrics/__init__.py new file mode 100644 index 000000000..bfc434d45 --- /dev/null +++ b/python_v2/metrics/__init__.py @@ -0,0 +1,4 
@@ +class Metric(object): + from metrics.chaoss import Chaoss as chaoss + from metrics.index import Index as index + from metrics.related_users import Relation as relation diff --git a/python_v2/metrics/basic.py b/python_v2/metrics/basic.py new file mode 100644 index 000000000..87ff913d3 --- /dev/null +++ b/python_v2/metrics/basic.py @@ -0,0 +1,345 @@ +from itertools import groupby +import db.clickhouse_wrapper as clickhouse_wrapper +from numpy import append +from label_data_utils import getGitHubData, getLabelData +import datetime +from easydict import EasyDict +import math + +QueryConfig = { + 'labelUnion': None, + 'labelIntersect': None, + 'repoIds': None, + 'orgIds': None, + 'repoNames': None, + 'orgNames': None, + 'userIds': None, + 'userLogins': None, + 'startYear': 2015, + 'startMonth': 1, + 'endYear': 2015, + 'endMonth': 12, + 'order': 'DESC', + 'limit': 10, + 'precision': 2, + 'groupBy': None, + 'groupTimeRange': None, + 'options': None +} + +def getMergedConfig(config): + defaultConfig = { + 'startYear': 2015, + 'startMonth': 1, + 'endYear': datetime.datetime.today().year, + 'endMonth': datetime.datetime.today().month, + 'orderOption': 'latest', + 'order': 'DESC', + 'limit': 10, + 'limitOption': 'all', + 'precision': 2, + } + defaultConfig.update(config) + return defaultConfig + + +def forEveryMonthByConfig(config, func): + return forEveryMonth(config.get('startYear'), config.get('startMonth'), config.get('endYear'), config.get('endMonth'), func) + +def forEveryMonth(startYear, startMonth, endYear, endMonth, func): + for y in range(startYear, endYear + 1): + begin_month = startMonth if y == startYear else 1 + end_month = endMonth if y == endYear else 12 + for m in range(begin_month, end_month + 1): + func(y, m) + +# Repo +def getRepoWhereClauseForNeo4j(config): + def process(l): + data = getGitHubData([l]) + data = EasyDict(data) + arr = [] + if len(data.githubRepos) > 0: arr.append('r.id IN {}'.format(data.githubRepos)) + if len(data.githubOrgs) > 0: arr.append('r.org_id IN {}'.format(data.githubOrgs)) + if len(arr) == 0: return None + return '({})'.format(' OR '.join(arr)) + repoWhereClauseArray = [] + if config.get('repoIds'): repoWhereClauseArray.append('r.id IN {}'.format(config.get('repoIds'))) + if config.get('repoNames'): repoWhereClauseArray.append('r.name IN {}'.format(config.get('repoNames'))) + if config.get('orgIds'): repoWhereClauseArray.append('r.org_id IN {}'.format(config.get('orgIds'))) + if config.get('orgNames'): repoWhereClauseArray.append('r.org_name IN {}'.format(config.get('orgNames'))) + if config.get('labelIntersect'): + return '(' + ' AND '.join(list(filter(lambda i: i != None, list(map(process, config.get('labelIntersect')))))) + ')' + if config.get('labelUnion'): + data = EasyDict(getGitHubData(config.get('labelUnion'))) + if len(data.githubRepos > 0): repoWhereClauseArray.append('r.id IN {}'.format(data.githubRepos)) + if len(data.githubOrgs > 0): repoWhereClauseArray.append('r.org_id IN {}'.format(data.githubOrgs)) + repoWhereClause = '({})'.format(' OR '.join(repoWhereClauseArray)) if len(repoWhereClauseArray) > 0 else None + return repoWhereClause + +def getRepoWhereClauseForClickhouse(config): + def process(l): + data = getGitHubData([l]) + data = EasyDict(data) + arr = [] + if len(data.githubRepos) > 0: arr.append('repo_id IN {}'.format(data.githubRepos)) + if len(data.githubOrgs) > 0: arr.append('org_id IN {}'.format(data.githubOrgs)) + if len(arr) == 0: return None + return '({})'.format(' OR '.join(arr)) + repoWhereClauseArray = [] + if 
config.get('repoIds'): repoWhereClauseArray.append('repo_id IN {}'.format(config.get('repoIds'))) + if config.get('repoNames'): + # find id first + sql = 'SELECT DISTINCT(repo_id) FROM opensource.gh_events WHERE repo_name IN {}'.format(config.get('repoNames')) + ids = clickhouse_wrapper.query(sql) + repoWhereClauseArray.append('repo_id IN {}'.format(list(map(lambda i: i[0], ids)))) + if config.get('orgIds'): repoWhereClauseArray.append('org_id IN {}'.format(config.get('orgIds'))) + if config.get('orgNames'): + # find id first + sql = 'SELECT DISTINCT(org_id) FROM opensource.gh_events WHERE org_login IN {}'.format(config.get('orgNames')) + ids = clickhouse_wrapper.query(sql) + repoWhereClauseArray.append('org_id IN {}'.format(list(map(lambda i: i[0], ids)))) + if config.get('labelIntersect'): + return '(' + ' AND '.join(list(filter(lambda i: i != None, list(map(process, config.get('labelIntersect')))))) + ')' + if config.get('labelUnion'): + data = EasyDict(getGitHubData(config.get('labelUnion'))) + if len(data.githubRepos > 0): repoWhereClauseArray.append('repo_idIN {}'.format(data.githubRepos)) + if len(data.githubOrgs > 0): repoWhereClauseArray.append('org_id IN {}'.format(data.githubOrgs)) + repoWhereClause = '({})'.format(' OR '.join(repoWhereClauseArray)) if len(repoWhereClauseArray) > 0 else None + return repoWhereClause + +# User +def getUserWhereClauseForNeo4j(config): + def process(l): + data = getGitHubData([l]) + data = EasyDict(data) + if len(data.githubUsers) > 0: return 'u.id IN {}'.format(data.githubUsers) + return None + userWhereClauseArray = [] + if config.get('userIds'): userWhereClauseArray.append('u.id IN {}'.format(config.get('userIds'))) + if config.get('userLogins'): userWhereClauseArray.append('u.login IN {}'.format(config.get('userLogins'))) + if config.get('labelIntersect'): + return '(' + ' AND '.join(list(filter(lambda i: i != None, list(map(process, config.get('labelIntersect')))))) + ')' + if config.get('labelUnion'): + data = EasyDict(getGitHubData(config.get('labelUnion'))) + if len(data.githubUsers > 0): userWhereClauseArray.append('u.id IN {}'.format(data.githubUsers)) + userWhereClause = '({})'.format(' OR '.join(userWhereClauseArray)) if len(userWhereClauseArray) > 0 else None + return userWhereClause + +def getUserWhereClauseForClickhouse(config): + def process(l): + data = getGitHubData([l]) + data = EasyDict(data) + if len(data.githubUsers) > 0: return 'actor_id IN {}'.format(data.githubUsers) + return None + userWhereClauseArray = [] + if config.get('userIds'): userWhereClauseArray.append('actor_id IN {}'.format(config.get('userIds'))) + if config.get('userLogins'): + # get id first + sql = 'SELECT DISTINCT(actor_id) FROM opensource.gh_events WHERE actor_login IN {}'.format(config.get('userLogins')) + ids = clickhouse_wrapper.query(sql) + userWhereClauseArray.append('actor_id IN {}'.format(list(map(lambda i: i[0], ids)))) + if config.get('labelIntersect'): + return '(' + ' AND '.join(list(filter(lambda i: i != None, list(map(process, config.get('labelIntersect')))))) + ')' + if config.get('labelUnion'): + data = EasyDict(getGitHubData(config.get('labelUnion'))) + if len(data.githubRepos > 0): userWhereClauseArray.append('actor_id IN {}'.format(data.githubUsers)) + userWhereClause = '({})'.format(' OR '.join(userWhereClauseArray)) if len(userWhereClauseArray) > 0 else None + return userWhereClause + +# Time +def getTimeRangeWhereClauseForNeo4j(config, type): + timeWhereClauseArray = [] + forEveryMonthByConfig(config, lambda y, m: 
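+        # Appends one activity check per month in the configured range; for a
+        # hypothetical range 2015-01..2015-03 with type 'r', the final clause is
+        # (r.activity_20151 > 0 OR r.activity_20152 > 0 OR r.activity_20153 > 0).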
timeWhereClauseArray.append('{}.activity_{}{} > 0'.format(type, y, m))) + if len(timeWhereClauseArray) == 0: raise Exception('Not valid time range.') + timeWhereClause = '({})'.format(' OR '.join(timeWhereClauseArray)) + return timeWhereClause + +def getTimeRangeSumClauseForNeo4j(config, type): + lastYear = 0 + lastQuarter = 0 + def process_quarter(y, m): + nonlocal lastQuarter + q = math.ceil(m / 3) + if q != lastQuarter: timeRangeSumClauseArray.append([]) + timeRangeSumClauseArray[len(timeRangeSumClauseArray) - 1].append('COALESCE({}_{}{}, 0.0)'.format(type, y, m)) + lastQuarter = q + def process_year(y, m): + nonlocal lastYear + if y != lastYear: timeRangeSumClauseArray.append([]) + timeRangeSumClauseArray[len(timeRangeSumClauseArray) - 1].append('COALESCE({}_{}{}, 0.0)'.format(type, y, m)) + lastYear = y + timeRangeSumClauseArray = [] + if config.get('groupTimeRange') == 'month': + # for every month individual, every element belongs to a individual element + forEveryMonthByConfig(config, lambda y, m: timeRangeSumClauseArray.append(['COALESCE({}_{}{}, 0.0)'.format(type, y, m)])) + elif config.get('groupTimeRange') == 'quarter': + # for every quarter, need to find out when to push a new element by quarter + forEveryMonthByConfig(config, process_quarter) + elif config.get('groupTimeRange') == 'year': + # for every year, need to find out when to push a new element by the year; + forEveryMonthByConfig(config, process_year) + else: + # for all to single one, push to the first element + timeRangeSumClauseArray.push([]) + forEveryMonthByConfig(config, lambda y, m: timeRangeSumClauseArray[0].append('COALESCE({}_{}{}, 0.0)'.format(type, y, m))) + if len(timeRangeSumClauseArray) == 0: raise Exception('Not valid time range.') + timeRangeSumClause = list(map(lambda i: 'round({}, {})'.format(' + '.join(i), config.get('percision')), timeRangeSumClauseArray)) + return timeRangeSumClause + +def getTimeRangeWhereClauseForClickhouse(config): + endDate = datetime.date(year = config.get('endYear')+1 if config.get('endMonth')+1>12 else config.get('endYear'), month = (config.get('endMonth')+1)%12, day = 1) + # endDate.setMonth(config.get('endMonth')) # find next month + return ' created_at >= toDate(\'{}-{}-1\') AND created_at < toDate(\'{}-{}-1\') '.format(config.get('startYear'), config.get('startMonth'), endDate.year, endDate.month) + +# clickhouse label group condition +def getLabelGroupConditionClauseForClickhouse(config): + labelData = list(filter(lambda l: l.get('type') == config.get('groupBy'), getLabelData())) if getLabelData() != None else None + if (labelData==None or len(labelData) == 0): raise Exception('Invalide group by label: {}'.format(config.get('groupBy'))) + idLabelRepoMap = {} + idLabelOrgMap = {} + idLabelUserMap = {} + def addToMap(my_map, id, label): + if not id in my_map: my_map[id] = [] + if my_map.get(id) != None: my_map.get(id).append(label) + + for l in labelData: + for id in l.get('githubOrgs'): addToMap(idLabelOrgMap, id, l.get('name')) + for id in l.get('githubRepos'): addToMap(idLabelRepoMap, id, l.get('name')) + for id in l.get('githubUsers'): addToMap(idLabelUserMap, id, l.get('name')) + + resultMap = {} # + def addToResultMap(my_map, id:int, labels:str, type): + """_summary_ + Args: + my_map (dict): dict + id (int): number + labels (str): string list + type (str): 'repo' | 'org' | 'user' + """ + key = str(labels) + if not key in my_map: my_map[key] = { 'labels':labels, 'repoIds': [], 'orgIds': [], 'userIds': [] } + if type == 'repo': + if my_map.get(key) != None: 
my_map.get(key).get('repoIds').append(id) + elif type == 'org': + if my_map.get(key) != None: my_map.get(key).get('orgIds').append(id) + elif type == 'user': + if my_map.get(key) != None: my_map.get(key).get('userIds').append(id) + + for id, labels in idLabelRepoMap.items(): addToResultMap(resultMap, id, labels, 'repo') + for id, labels in idLabelOrgMap.items(): addToResultMap(resultMap, id, labels, 'org') + for id, labels in idLabelUserMap.items(): addToResultMap(resultMap, id, labels, 'user') + + def process(v): + c = [] + if len(v.get('orgIds')) > 0: c.append('org_id IN ({})'.format(','.join(str(i) for i in v.get('orgIds')))) + if len(v.get('repoIds')) > 0: c.append('repo_id IN ({})'.format(','.join(str(i) for i in v.get('repoIds')))) + if len(v.get('userIds')) > 0: c.append('actor_id IN ({})'.format(','.join(str(i) for i in v.get('userIds')))) + return '({}),[{}]'.format(' OR '.join(c), ','.join(map(lambda l: '\'{}\''.format(l),v.get('labels')))) + conditions = ','.join(list(map(process, resultMap.values()))) + + return 'arrayJoin(multiIf({}, [\'Others\']))'.format(conditions) + +def getGroupArrayInsertAtClauseForClickhouse(config, option): + """_summary_ + Args: + config (dict): QueryConfig + option (_type_): { key: string; defaultValue?: string; value?: string; } + """ + start_time = f"toDate('{config['startYear']}-{config['startMonth']}-1')" + end_time = f"toDate('{config['endYear']}-{config['endMonth']}-1')" + + default_value = option.get('defaultValue', 0) + + total_length = "" + if config.get('groupTimeRange'): + total_length = f"toUInt32(dateDiff('{config['groupTimeRange']}', {start_time}, {end_time})) + 1" + else: + total_length = "1" + + fieldName = option.get('value', option['key']) + if config['precision'] > 0 and not option.get('noPrecision'): + group_key = f"ROUND({fieldName}, {config['precision']})" + else: + group_key = fieldName + + if not config.get('groupTimeRange'): + position = "0" + else: + if config['groupTimeRange'] == 'quarter': + start_time = f"toStartOfQuarter({start_time})" + elif config['groupTimeRange'] == 'year': + start_time = f"toStartOfYear({start_time})" + position = f"toUInt32(dateDiff('{config['groupTimeRange']}', {start_time}, time){'-1' if option.get('positionByEndTime') else ''})" + + return f'''groupArrayInsertAt( + {default_value}, + {total_length})({group_key}, + {position}) AS {option['key']}''' + +def getGroupTimeClauseForClickhouse(config, timeCol = 'created_at') -> str: + """_summary_ + Args: + config (_type_): _description_ + timeCol (str, optional): _description_. Defaults to 'created_at'. + + Returns: + str: _description_ + """ + groupEle = '1' # no time range, aggregate all data to a single value + if config.get('groupTimeRange') == 'month': groupEle = 'toStartOfMonth({})'.format(timeCol) + elif config.get('groupTimeRange') == 'quarter': groupEle = 'toStartOfQuarter({})'.format(timeCol) + elif config.get('groupTimeRange') == 'year': groupEle = 'toStartOfYear({})'.format(timeCol) + return '{} AS time'.format(groupEle) + +def getGroupIdClauseForClickhouse(config, type = 'repo', timeCol = 'created_at') -> str: + """_summary_ + Args: + config (_type_): _description_ + type (str, optional): _description_. Defaults to 'repo'. + timeCol (str, optional): _description_. Defaults to 'created_at'. 
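+
+        Note: with no groupBy this resolves to
+        'repo_id AS id, argMax(repo_name, time) AS name' (or the actor_id /
+        actor_login pair when type is not 'repo'); groupBy='org' selects
+        org_id / org_login, and any other groupBy value is treated as a label
+        group via getLabelGroupConditionClauseForClickhouse.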
+ + Returns: + str: _description_ + """ + if config.get('groupBy') == None: #group by repo' + if type == 'repo': + return 'repo_id AS id, argMax(repo_name, time) AS name' + else: + return 'actor_id AS id, argMax(actor_login, time) AS name' + elif config.get('groupBy') == 'org': + return 'org_id AS id, argMax(org_login, time) AS name' + else : # group by label + return '{} AS id, id AS name'.format(getLabelGroupConditionClauseForClickhouse(config)) + +def getInnerOrderAndLimit(config, col, index=None): + if config.get('limitOption') == 'each' and config.get('limit', 0) > 0: + order_by_clause = f"ORDER BY {col}[{index}] {config.get('order')}" if config.get('order') else '' + limit_clause = f"LIMIT {config.get('limit')} BY time" + return f"{order_by_clause} {limit_clause}" + else: + return '' + +def getOutterOrderAndLimit(config, col, index=None): + order_clause = "" + if config.get('order'): + if config.get('orderOption') == 'latest': + order_clause = f"ORDER BY {col}[-1]{f'[{index}]' if index is not None else ''}" + else: + index_clause = f"x -> x[{index}], " if index is not None else '' + order_clause = f"ORDER BY arraySum({index_clause}{col})" + limit_clause = f"LIMIT {config.get('limit')}" if config.get('limitOption') == 'all' and config.get('limit', 0) > 0 else '' + return f"{order_clause} {config.get('order', '')} {limit_clause}" + +def filterEnumType(value, types, defautlValue: str) -> str: + """_summary_ + Args: + value (_type_): _description_ + types (str list): _description_ + defautlValue (str): _description_ + + Returns: + str: _description_ + """ + if not value or not value in types: return defautlValue + return value diff --git a/python_v2/metrics/chaoss.py b/python_v2/metrics/chaoss.py new file mode 100644 index 000000000..63dcbf4c5 --- /dev/null +++ b/python_v2/metrics/chaoss.py @@ -0,0 +1,795 @@ +import datetime +from typing import Tuple,List +from .basic import filterEnumType,\ + getGroupArrayInsertAtClauseForClickhouse,\ + getGroupTimeClauseForClickhouse,\ + getGroupIdClauseForClickhouse,\ + getMergedConfig,\ + getRepoWhereClauseForClickhouse,\ + getTimeRangeWhereClauseForClickhouse,\ + getInnerOrderAndLimit,\ + getOutterOrderAndLimit,\ + getUserWhereClauseForClickhouse,\ + QueryConfig +from db.clickhouse_wrapper import ClickhouseWrapper +clickhouse = ClickhouseWrapper() + +class Chaoss(): + __ISSUE_COMMENT_WEIGHT = 1 + __OPEN_ISSUE_WEIGHT = 2 + __OPEN_PULL_WEIGHT = 3 + __REVIEW_COMMENT_WEIGHT = 4 + __PULL_MERGED_WEIGHT = 2 + __basicActivitySqlComponent = f''' + if(type=\'PullRequestEvent\' AND action=\'closed\' AND pull_merged=1, issue_author_id, actor_id) AS actor_id, + argMax(if(type=\'PullRequestEvent\' AND action=\'closed\' AND pull_merged=1, issue_author_login, actor_login), created_at) AS actor_login, + countIf(type=\'IssueCommentEvent\' AND action=\'created\') AS issue_comment, + countIf(type=\'IssuesEvent\' AND action=\'opened\') AS open_issue, + countIf(type=\'PullRequestEvent\' AND action=\'opened\') AS open_pull, + countIf(type=\'PullRequestReviewCommentEvent\' AND action=\'created\') AS review_comment, + countIf(type=\'PullRequestEvent\' AND action=\'closed\' AND pull_merged=1) AS merged_pull, + sqrt({__ISSUE_COMMENT_WEIGHT}*issue_comment + {__OPEN_ISSUE_WEIGHT}*open_issue + {__OPEN_PULL_WEIGHT}*open_pull + {__REVIEW_COMMENT_WEIGHT}*review_comment + {__PULL_MERGED_WEIGHT}*merged_pull) AS activity + ''' + + CodeChangeCommitsOptions= { + # a filter regular expression for commit message + 'messageFilter': 
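+        # When options['messageFilter'] is set, only commit messages matching this
+        # regular expression are counted; the value below matches Conventional
+        # Commits prefixes (feat:, fix:, docs:, ...).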
'^(build:|chore:|ci:|docs:|feat:|fix:|perf:|refactor:|revert:|style:|test:).*' + } + + timeDurationConstants = { + "unitArray": ['week', 'day', 'hour', 'minute'], + "sortByArray": ['avg', 'levels', 'quantile_0', 'quantile_1', 'quantile_2', 'quantile_3', 'quantile_4'], + "quantileArray": list(range(5)), + } + + def __bulidInnnerCountSql(config, whereClauses, type='repo'): + return f''' + SELECT + {getGroupTimeClauseForClickhouse(config)}, + {getGroupIdClauseForClickhouse(config, type)}, + COUNT() AS count + FROM opensource.gh_events + WHERE {' AND '.join(whereClauses)} + GROUP BY id, time + {getInnerOrderAndLimit(config, 'count')} + ''' + + def __bulidOuterCountSql(config, inner_sql, countColName): + return f''' + SELECT + id, + argMax(name, time) AS name, + SUM(count) AS total_count, + {getGroupArrayInsertAtClauseForClickhouse(config, { 'key': countColName, 'value':'count' })} + FROM + ({inner_sql}) + GROUP BY id + {getOutterOrderAndLimit(config, countColName)} + ''' + + def __executeInnnerSql(inner_sql, columns=['time','id','name','count']): + queryResult = clickhouse.query(inner_sql) + rst = list(map(lambda row: dict(zip(columns,row)), queryResult)) + return rst + + def __executeOuterSql(generated_sql, columns, processMethod): + queryResult = clickhouse.query(generated_sql) + rst = [processMethod(row, columns) for row in queryResult] + # rst = list(map(lambda row: dict(zip(columns,row)), queryResult)) + return rst + + def __process(row, cloumns): + processResult = dict(zip(cloumns,row)) + return processResult + + def __processAppendRatio(row, cloumns, countIndex = -1, totalCountIndex = -2): + processResult = dict(zip(cloumns,row)) + count = row[countIndex] + total_count = row[totalCountIndex] + processResult['ratio'] = list(map(lambda v: '{}%'.format(str(format((v*100/total_count), '.2f'))), count)) + return processResult + + def chaossCodeChangeCommits(config, mode='outer') -> (List,str): + """_summary_ + + Args: + config (QueryConfig): _description_ + """ + config = getMergedConfig(config) + whereClauses = ["type = \'PushEvent\' "] + repoWhereClause = getRepoWhereClauseForClickhouse(config) + if repoWhereClause != None: whereClauses.append(repoWhereClause) + whereClauses.append(getTimeRangeWhereClauseForClickhouse(config)) + if config.get('options') and config.get('options').get('messageFilter'): + arrayJoinMessage = 'arrayFilter(x -> match(x, \'{}\'), push_commits.message)'.format(config.get('options').get('messageFilter')) + else: + arrayJoinMessage = 'push_commits.message' + + rst = [] + inner_sql = f''' + SELECT + {getGroupTimeClauseForClickhouse(config)}, + {getGroupIdClauseForClickhouse(config, 'repo')}, + COUNT(arrayJoin({arrayJoinMessage})) AS count + FROM opensource.gh_events + WHERE {' AND '.join(whereClauses)} + GROUP BY id, time + {getInnerOrderAndLimit(config, 'count')} + ''' + + if mode == 'origin': + rst = Chaoss.__executeInnnerSql(inner_sql) + return rst, inner_sql + + generated_sql = Chaoss.__bulidOuterCountSql(config, inner_sql, 'commits_count') + columns = ['id', 'name', 'total_count', 'count'] + rst = Chaoss.__executeOuterSql(generated_sql, columns, Chaoss.__process) + return rst, generated_sql + + def __chaossCount(config, mode, whereClauses, countColName) -> (List,str): + config = getMergedConfig(config) + repoWhereClause = getRepoWhereClauseForClickhouse(config) + if repoWhereClause != None: whereClauses.append(repoWhereClause) + whereClauses.append(getTimeRangeWhereClauseForClickhouse(config)) + rst = [] + inner_sql = Chaoss.__bulidInnnerCountSql(config, 
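+            # Shared counting pipeline: the inner SQL yields one (time, id, name, count)
+            # row per group and period; mode='origin' returns those rows directly, while
+            # the default outer SQL pivots them with groupArrayInsertAt into a single row
+            # per id holding the per-period counts array (a percentage ratio is appended
+            # in Python afterwards by __processAppendRatio).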
whereClauses) + + if mode == 'origin': + rst = Chaoss.__executeInnnerSql(inner_sql) + return rst, inner_sql + generated_sql = Chaoss.__bulidOuterCountSql(config, inner_sql, countColName) + columns = ['id', 'name', 'total_count', 'count', 'ratio'] + rst = Chaoss.__executeOuterSql(generated_sql, columns, Chaoss.__processAppendRatio) + return rst, generated_sql + + def chaossIssuesNew(config, mode='outer') -> (List,str): + """_summary_ + + Args: + config (dict): QueryConfig + """ + whereClauses = ["type = \'IssuesEvent\' AND action IN (\'opened\', \'reopened\')"] + return Chaoss.__chaossCount(config, mode, whereClauses, 'issues_new_count') + + def chaossIssuesClosed(config, mode='outer') -> (List,str): + """_summary_ + + Args: + config (QueryConfig): _description_ + """ + whereClauses = ["type = \'IssuesEvent\' AND action = \'closed\'"] + return Chaoss.__chaossCount(config, mode, whereClauses, 'issues_close_count') + + def chaossChangeRequestsAccepted(config: QueryConfig, mode='outer') -> (List,str): + """_summary_ + + Args: + config (QueryConfig): _description_ + """ + whereClauses = ["type = \'PullRequestEvent\' AND action = \'closed\' AND pull_merged = 1"] + return Chaoss.__chaossCount(config, mode, whereClauses, 'change_requests_accepted') + + def chaossChangeRequestsDeclined(config: QueryConfig, mode='outer') -> (List,str): + """_summary_ + + Args: + config (QueryConfig): _description_ + """ + whereClauses = ["type = \'PullRequestEvent\' AND action = \'closed\' AND pull_merged = 0"] + return Chaoss.__chaossCount(config, mode, whereClauses, 'change_requests_declined') + + BusFactorOptions = { + # calculate bus factor by change request or git commit, or activity index. default: activity ('commit' | 'change request' | 'activity') + 'by': 'activity', + # the bus factor percentage thredhold, default: 0.5 + 'percentage': 0.5, + # include GitHub Apps account, default: false + 'withBot': False, + } + + def chaossBusFactor(config, mode='outer') -> (List,str): + """_summary_ + + Args: + config (QueryConfig): QueryConfig + + Returns: + _type_: _description_ + """ + config = getMergedConfig(config) + by = filterEnumType(config.get('options').get('by') if config.get('options') != None else None, ['commit', 'change request', 'activity'], 'activity') + whereClauses = [] + if by == 'commit': + whereClauses.append("type = \'PushEvent\'") + elif by == 'change request': + whereClauses.append("type = \'PullRequestEvent\' AND action = \'closed\' AND pull_merged = 1") + elif by == 'activity': + whereClauses.append("type IN (\'IssuesEvent\', \'IssueCommentEvent\', \'PullRequestEvent\', \'PullRequestReviewCommentEvent\')") + repoWhereClause = getRepoWhereClauseForClickhouse(config) + if repoWhereClause != None: whereClauses.append(repoWhereClause) + whereClauses.append(getTimeRangeWhereClauseForClickhouse(config)) + + percentage = str(1 - config.get('options').get('percentage')) if config.get('options') and 'percentage' in config.get('options') else '0.5' + + authorFieldName = 'actor_login' if by == 'activity' else 'author' + if config.get('options', {}).get('withBot') and by != 'commit': + botFilterHavingClause = "" + else: + botFilterHavingClause = f"HAVING {authorFieldName} NOT LIKE '%[bot]'" + + rst = [] + inner_sql = f''' + SELECT + time, + id, + any(name) AS name, + SUM(count) AS total_contributions, + length(detail) AS bus_factor, + arrayFilter(x -> tupleElement(x, 2) >= quantileExactWeighted({percentage}) (count, count), arrayMap((x, y) -> (x, y), groupArray({authorFieldName}), groupArray(count))) AS 
detail + FROM + ( + SELECT + {getGroupTimeClauseForClickhouse(config)}, + {getGroupIdClauseForClickhouse(config)}, + { + 'arrayJoin(push_commits.name) AS author, COUNT() AS count' if by == 'commit' else + 'issue_author_id AS actor_id, argMax(issue_author_login, created_at) AS author, COUNT() AS count' if by == 'change request' else + f'{Chaoss.__basicActivitySqlComponent}, toUInt32(ceil(activity)) AS count' + } + FROM opensource.gh_events + WHERE {' AND '.join(whereClauses)} + GROUP BY id, time, {('author' if by == 'commit' else 'actor_id')} + {botFilterHavingClause} + ) + GROUP BY id, time + {getInnerOrderAndLimit(config, 'bus_factor')} + ''' + if mode == 'origin': + columns = ['time', 'id', 'name', 'total_contributions', 'bus_factor', 'detail'] + rst = Chaoss.__executeInnnerSql(inner_sql, columns) + return rst, inner_sql + + generated_sql = f''' + SELECT + id, + argMax(name, time) AS name, + {getGroupArrayInsertAtClauseForClickhouse(config, {"key": "bus_factor"})}, + {getGroupArrayInsertAtClauseForClickhouse(config, {"key": "detail", "noPrecision": True, "defaultValue": "[]"})}, + {getGroupArrayInsertAtClauseForClickhouse(config, {"key": "total_contributions"})} + FROM + ({inner_sql}) + GROUP BY id + {getOutterOrderAndLimit(config, 'bus_factor')} + ''' + columns = ['id', 'name', 'bus_factor', 'detail', 'total_contributions'] + rst = Chaoss.__executeOuterSql(generated_sql, columns, Chaoss.__process) + return rst, generated_sql + + IssueResolutionDurationOptions = { + 'by': 'open', #'open' | 'close' + 'type': 'avg', #'avg' | 'median' + 'unit': 'week' #'week' | 'day' | 'hour' | 'minute' + } + + def __chaossResolutionDuration(config, type, mode) -> (List,str): + """_summary_ + + Args: + config (QueryConfig): _description_ + """ + config = getMergedConfig(config) + whereClauses = ["type = 'IssuesEvent'"] if type == 'issue' else ["type = 'PullRequestEvent'"] + repoWhereClause = getRepoWhereClauseForClickhouse(config) + if repoWhereClause: whereClauses.append(repoWhereClause) + + endDate = datetime.date(year = config.get('endYear')+1 if config.get('endMonth')+1>12 else config.get('endYear'), month = (config.get('endMonth')+1)%12, day = 1) + + by = filterEnumType(config.get("options", {}).get("by"), ['open', 'close'], 'open') + byCol = 'opened_at' if by == 'open' else 'closed_at' + unit = filterEnumType(config.get("options", {}).get("unit"), Chaoss.timeDurationConstants["unitArray"], 'day') + thresholds = config.get("options", {}).get("thresholds", [3, 7, 15]) + ranges = thresholds + [-1] + sortBy = filterEnumType(config.get("options", {}).get("sortBy"), Chaoss.timeDurationConstants["sortByArray"], 'avg') + + rst = [] + inner_sql = f''' + SELECT + {getGroupTimeClauseForClickhouse(config, byCol)}, + {getGroupIdClauseForClickhouse(config, 'repo')}, + avg(resolution_duration) AS avg, + {', '.join([f'quantile({q / 4})(resolution_duration) AS quantile_{q}' for q in Chaoss.timeDurationConstants["quantileArray"]])}, + [{', '.join([f'countIf(resolution_level = {i})' for i in range(len(ranges))])}] AS resolution_levels + FROM + ( + SELECT + repo_id, + argMax(repo_name, created_at) AS repo_name, + org_id, + argMax(org_login, created_at) AS org_login, + issue_number, + argMaxIf(action, created_at, action IN ('opened', 'closed' , 'reopened')) AS last_action, + argMax(issue_created_at, created_at) AS opened_at, + maxIf(created_at, action = 'closed') AS closed_at, + dateDiff('{unit}', opened_at, closed_at) AS resolution_duration, + multiIf({', '.join([f'resolution_duration <= {t}, {i}' for i, t in 
enumerate(thresholds)])}, {len(thresholds)}) AS resolution_level + FROM opensource.gh_events + WHERE {' AND '.join(whereClauses)} + GROUP BY repo_id, org_id, issue_number + HAVING {byCol} >= toDate('{config['startYear']}-{config['startMonth']}-1') AND {byCol} < toDate('{endDate.year}-{endDate.month}-1') AND last_action='closed' + ) + GROUP BY id, time + {getInnerOrderAndLimit(config, 'resolution_duration')} + ''' + if mode == 'origin': + columns = ['time','id','name','avg', 'quantile_0', 'quantile_1', 'quantile_2', 'quantile_3', 'quantile_4', 'resolution_levels'] + rst = Chaoss.__executeInnnerSql(inner_sql, columns) + return rst, inner_sql + + generated_sql = f''' + SELECT + id, + argMax(name, time) As name, + {getGroupArrayInsertAtClauseForClickhouse(config, { "key": "avg", "defaultValue": 'NaN' })}, + {getGroupArrayInsertAtClauseForClickhouse(config, { "key": 'levels', "value": 'resolution_levels', "defaultValue": "[]", "noPrecision": True })}, + {', '.join([getGroupArrayInsertAtClauseForClickhouse(config, { "key": f"quantile_{q}", "defaultValue": 'NaN' }) for q in Chaoss.timeDurationConstants["quantileArray"]])} + FROM + ({inner_sql}) + GROUP BY id + {getOutterOrderAndLimit(config, sortBy, 1 if sortBy == 'levels' else None)} + ''' + columns = ['id', 'name', 'resolution_duration_avg', 'levels', 'quantile_0', 'quantile_1', 'quantile_2', 'quantile_3', 'quantile_4'] + rst = Chaoss.__executeInnnerSql(generated_sql, columns) + return rst, generated_sql + + def chaossIssueResolutionDuration(config, mode='outer') -> (List,str): + return Chaoss.__chaossResolutionDuration(config, 'issue', mode) + + def chaossChangeRequestResolutionDuration(config, mode='outer') -> (List,str): + return Chaoss.__chaossResolutionDuration(config, 'change request', mode) + + def __chaossResponseTime(config, type, mode) -> (List,str): + config = getMergedConfig(config) + whereClauses = [] + + if type == 'issue': + whereClauses.append("type IN ('IssueCommentEvent', 'IssuesEvent') AND actor_login NOT LIKE '%[bot]'") + else: + whereClauses.append("type IN ('IssueCommentEvent', 'PullRequestEvent', 'PullRequestReviewCommentEvent', 'PullRequestReviewEvent') AND actor_login NOT LIKE '%[bot]'") + + repoWhereClause = getRepoWhereClauseForClickhouse(config) + if repoWhereClause: + whereClauses.append(repoWhereClause) + + endDate = datetime.date(year = config.get('endYear')+1 if config.get('endMonth')+1>12 else config.get('endYear'), month = (config.get('endMonth')+1)%12, day = 1) + unit = filterEnumType(config.get("options", {}).get("unit"), Chaoss.timeDurationConstants["unitArray"], 'day') + thresholds = config.get("options", {}).get("thresholds", [3, 7, 15]) + ranges = thresholds + [-1] + sortBy = filterEnumType(config.get("options", {}).get("sortBy"), Chaoss.timeDurationConstants["sortByArray"], 'avg') + + rst = [] + inner_sql = f''' + SELECT + {getGroupTimeClauseForClickhouse(config, 'issue_created_at')}, + {getGroupIdClauseForClickhouse(config)}, + avg(response_time) AS avg, + {', '.join([f'quantile({q / 4})(response_time) AS quantile_{q}' for q in Chaoss.timeDurationConstants["quantileArray"]])}, + [{', '.join([f'countIf(response_level = {i})' for i in range(len(ranges))])}] AS response_levels + FROM + ( + SELECT + repo_id, + argMax(repo_name, created_at) AS repo_name, + org_id, + argMax(org_login, created_at) AS org_login, + issue_number, + minIf(created_at, action = 'opened' AND issue_comments = 0) AS issue_created_at, + minIf(created_at, (action = 'created' AND actor_id != issue_author_id) OR (action = 'closed')) AS 
responded_at, + if(responded_at = toDate('1970-01-01'), now(), responded_at) AS first_responded_at, + dateDiff('{unit}', issue_created_at, first_responded_at) AS response_time, + multiIf({', '.join([f'response_time <= {t}, {i}' for i, t in enumerate(thresholds)])}, {len(thresholds)}) AS response_level + FROM opensource.gh_events + WHERE {' AND '.join(whereClauses)} + GROUP BY repo_id, org_id, issue_number + HAVING issue_created_at >= toDate('{config.get('startYear')}-{config.get('startMonth')}-1') + AND issue_created_at < toDate('{endDate.year}-{endDate.month}-1') + ) + GROUP BY id, time + {getInnerOrderAndLimit(config, 'resolution_duration')} + ''' + if mode == 'origin': + columns = ['time','id','name','avg', 'quantile_0', 'quantile_1', 'quantile_2', 'quantile_3', 'quantile_4', 'response_levels'] + rst = Chaoss.__executeInnnerSql(inner_sql,columns) + return rst, inner_sql + + generated_sql = f''' + SELECT + id, + argMax(name, time), + {getGroupArrayInsertAtClauseForClickhouse(config, { "key": "avg", "defaultValue": 'NaN' })}, + {getGroupArrayInsertAtClauseForClickhouse(config, { "key": 'levels', "value": 'response_levels', "defaultValue": "[]", "noPrecision": True })}, + {', '.join([getGroupArrayInsertAtClauseForClickhouse(config, { "key": f"quantile_{q}", "defaultValue": 'NaN' }) for q in Chaoss.timeDurationConstants["quantileArray"]])} + FROM + ({inner_sql}) + GROUP BY id + {getOutterOrderAndLimit(config, sortBy, 1 if sortBy == 'levels' else None)} + ''' + columns = ['id', 'name', 'response_time_avg', 'levels', 'quantile_0', 'quantile_1', 'quantile_2', 'quantile_3', 'quantile_4'] + rst = Chaoss.__executeOuterSql(generated_sql, columns, Chaoss.__process) + return rst, generated_sql + + def chaossIssueResponseTime(config, mode='outer') -> (List,str): + return Chaoss.__chaossResponseTime(config, 'issue', mode) + + def chaossChangeRequestResponseTime(config, mode='outer') -> (List,str): + return Chaoss.__chaossResponseTime(config, 'change request', mode) + + def __chaossAge(config, type, mode) -> (List,str): + config = getMergedConfig(config) + whereClauses = [] + + if type == 'issue': + whereClauses.append("type='IssuesEvent'") + else: + whereClauses.append("type='PullRequestEvent'") + + repoWhereClause = getRepoWhereClauseForClickhouse(config) + if repoWhereClause: + whereClauses.append(repoWhereClause) + + endDate = datetime.date(year = config.get('endYear')+1 if config.get('endMonth')+1>12 else config.get('endYear'), month = (config.get('endMonth')+1)%12, day = 1) + endTimeClause = f"toDate('{endDate.year}-{endDate.month}-1')" + whereClauses.append(f"created_at < {endTimeClause}") + if config['groupTimeRange']: + timeClause = f"arrayJoin(arrayMap(x -> dateAdd({config.get('groupTimeRange')}, x + 1, toDate('{config.get('startYear')}-{config.get('startMonth')}-1')), range(toUInt64(dateDiff('{config.get('groupTimeRange')}', toDate('{config.get('startYear')}-{config.get('startMonth')}-1'), {endTimeClause}))))) AS time" + else: + timeClause = f"{endTimeClause} AS time" + + unit = filterEnumType(config.get("options", {}).get("unit"), Chaoss.timeDurationConstants["unitArray"], 'day') + thresholds = config.get("options", {}).get("thresholds", [15, 30, 60]) + ranges = thresholds + [-1] + sortBy = filterEnumType(config.get("options", {}).get("sortBy"), Chaoss.timeDurationConstants["sortByArray"], 'avg') + + inner_sql = f''' + SELECT + {timeClause}, + {getGroupIdClauseForClickhouse(config)}, + avgIf(dateDiff('{unit}', opened_at, time), opened_at < time AND closed_at >= time) AS avg, + {', 
'.join([f"quantileIf({q / 4})(dateDiff('{unit}', opened_at, time), opened_at < time AND closed_at >= time) AS quantile_{q}" for q in Chaoss.timeDurationConstants["quantileArray"]])}, + [{', '.join([f"""countIf(multiIf({', '.join([f"dateDiff('{unit}', opened_at, time) <= {t}, {i}" for i, t in enumerate(thresholds)])}, {len(thresholds)}) = {i} AND opened_at < time AND closed_at >= time)""" for i in range(len(ranges))])}] AS age_levels + FROM + ( + SELECT + repo_id, + argMax(repo_name, created_at) AS repo_name, + org_id, + argMax(org_login, created_at) AS org_login, + issue_number, + minIf(created_at, action = 'opened') AS opened_at, + maxIf(created_at, action = 'closed') AS real_closed_at, + if(real_closed_at=toDate('1970-1-1'), {endTimeClause}, real_closed_at) AS closed_at + FROM opensource.gh_events + WHERE {' AND '.join(whereClauses)} + GROUP BY repo_id, org_id, issue_number + HAVING opened_at > toDate('1970-01-01') + ) + GROUP BY id, time + {getInnerOrderAndLimit(config, 'age')} + ''' + if mode == 'origin': + columns = ['time','id','name','avg', 'quantile_0', 'quantile_1', 'quantile_2', 'quantile_3', 'quantile_4', 'age_levels'] + rst = Chaoss.__executeInnnerSql(inner_sql,columns) + return rst, inner_sql + + generated_sql = f''' + SELECT + id, + argMax(name, time), + {getGroupArrayInsertAtClauseForClickhouse(config, { "key": "avg", "defaultValue": 'NaN', "positionByEndTime": True })}, + {getGroupArrayInsertAtClauseForClickhouse(config, { "key": 'levels', "value": 'if(arrayAll(x -> x = 0, age_levels), [], age_levels)', "defaultValue": "[]", "noPrecision": True, "positionByEndTime": True })}, + {', '.join([getGroupArrayInsertAtClauseForClickhouse(config, { "key": f'quantile_{q}', "defaultValue": 'NaN', "positionByEndTime": True}) for q in Chaoss.timeDurationConstants["quantileArray"]])} + FROM + ({inner_sql}) + GROUP BY id + {getOutterOrderAndLimit(config, sortBy, 1 if sortBy == 'levels' else None)} + ''' + columns = ['id', 'name', 'response_time_avg', 'levels', 'quantile_0', 'quantile_1', 'quantile_2', 'quantile_3', 'quantile_4'] + rst = Chaoss.__executeOuterSql(generated_sql, columns, Chaoss.__process) + return rst, generated_sql + + def chaossIssueAge(config, mode='outer') -> (List,str): + return Chaoss.__chaossAge(config, 'issue', mode) + + def chaossChangeRequestAge(config, mode='outer') -> (List,str): + return Chaoss.__chaossAge(config, 'change request', mode) + + #Evolution - Code Development Efficiency + def chaossChangeRequestsAccepted(config, mode='outer') -> (List,str): + whereClauses = ["type = 'PullRequestEvent' AND action = 'closed' AND pull_merged = 1"] + return Chaoss.__chaossCount(config, mode, whereClauses, 'change_requests_accepted') + + def chaossChangeRequestsDeclined(config, mode='outer') -> (List,str): + whereClauses = ["type = 'PullRequestEvent' AND action = 'closed' AND pull_merged = 0"] + return Chaoss.__chaossCount(config, mode, whereClauses, 'change_requests_accepted') + + def chaossChangeRequestsAcceptanceRatio(config, mode='outer') -> (List,str): + config = getMergedConfig(config) + whereClauses = ["type = 'PullRequestEvent' AND action = 'closed' "] + repoWhereClause = getRepoWhereClauseForClickhouse(config) + if repoWhereClause: + whereClauses.append(repoWhereClause) + whereClauses.append(getTimeRangeWhereClauseForClickhouse(config)) + + inner_sql = f''' + SELECT + {getGroupTimeClauseForClickhouse(config)}, + {getGroupIdClauseForClickhouse(config)}, + COUNT() AS count, + countIf(pull_merged = 1) AS accepted_count, + countIf(pull_merged = 0) AS declined_count, 
+ accepted_count / count AS ratio + FROM opensource.gh_events + WHERE {" AND ".join(whereClauses)} + GROUP BY id, time + {getInnerOrderAndLimit(config, 'ratio')} + ''' + if mode == 'origin': + columns = ['time','id','name', 'count', 'accepted_count', 'declined_count', 'ratio'] + rst = Chaoss.__executeInnnerSql(inner_sql,columns) + return rst, inner_sql + + generated_sql = f''' + SELECT + id, + argMax(name, time) AS name, + {getGroupArrayInsertAtClauseForClickhouse(config, {'key': 'change_requests_accepted_ratio', 'value': 'ratio'})}, + {getGroupArrayInsertAtClauseForClickhouse(config, {'key': 'change_requests_accepted', 'value': 'accepted_count'})}, + {getGroupArrayInsertAtClauseForClickhouse(config, {'key': 'change_requests_declined', 'value': 'declined_count'})} + FROM + ({inner_sql}) + GROUP BY id + {getOutterOrderAndLimit(config, 'change_requests_accepted_ratio')} + ''' + columns = ['id', 'name', 'ratio', 'accepted_count', 'declined_count'] + rst = Chaoss.__executeOuterSql(generated_sql, columns, Chaoss.__process) + return rst, generated_sql + + # Evolution - Code Development Process Quality + def chaossChangeRequests(config, mode='outer') -> (List,str): + whereClauses = ["type = 'PullRequestEvent' AND action = 'opened'"] + return Chaoss.__chaossCount(config, mode, whereClauses, 'change_requests_count') + + def chaossChangeRequestReviews(config, mode='outer') -> (List,str): + whereClauses = ["type = 'PullRequestReviewCommentEvent'"] + return Chaoss.__chaossCount(config, mode, whereClauses, 'change_requests_reviews_count') + + NewContributorsOptions = { + 'by': 'commit', #'commit' | 'change request' + 'withBot': False + } + + def chaossNewContributors(config, mode='outer') -> (List,str): + config = getMergedConfig(config) + by = filterEnumType(config.get('options').get('by') if config.get('options') != None else None, ['commit', 'change request'], 'change request') + whereClauses = [] + + endDate = datetime.date(year = config.get('endYear')+1 if config.get('endMonth')+1>12 else config.get('endYear'), month = (config.get('endMonth')+1)%12, day = 1) + + if by == 'commit': + whereClauses.append("type = 'PushEvent'") + elif by == 'change request': + whereClauses.append("type = 'PullRequestEvent' AND action = 'closed' AND pull_merged = 1") + + repoWhereClause = getRepoWhereClauseForClickhouse(config) + if repoWhereClause: + whereClauses.append(repoWhereClause) + + inner_sql = f''' + SELECT + {getGroupTimeClauseForClickhouse(config, 'first_time')}, + {getGroupIdClauseForClickhouse(config)}, + length(detail) AS new_contributor, + (arrayMap((x) -> (x), groupArray(author))) AS detail + FROM + ( + SELECT + min(created_at) AS first_time, + repo_id, + argMax(repo_name, created_at) AS repo_name, + org_id, + argMax(org_login, created_at) AS org_login, + {'author' if by == 'commit' else('actor_id, argMax(author,created_at) AS author' if by == 'change request' else '' )} + FROM + ( + SELECT + repo_id, + repo_name, + org_id, + org_login, + {'arrayJoin(push_commits.name) AS author' if by == 'commit' + else('issue_author_id AS actor_id, issue_author_login AS author' if by == 'change request' else '' )}, + created_at + FROM opensource.gh_events + WHERE {" AND ".join(whereClauses)} + {'' if config.get("options", {}).get("withBot") and by != 'commit' else "HAVING author NOT LIKE '%[bot]'"} + ) + GROUP BY repo_id, org_id, {'author' if by == 'commit' else 'actor_id'} + HAVING first_time >= toDate('{config.get('startYear')}-{config.get('startMonth')}-1') AND first_time < 
toDate('{endDate.year}-{endDate.month}-1') + ) + GROUP BY id, time + {getInnerOrderAndLimit(config, 'new_contributor')} + ''' + if mode == 'origin': + columns = ['time','id','name', 'new_contributor', 'detail'] + rst = Chaoss.__executeInnnerSql(inner_sql,columns) + return rst, inner_sql + + generated_sql = f''' + SELECT + id, + argMax(name, time) AS name, + {getGroupArrayInsertAtClauseForClickhouse(config, {'key': 'new_contributors', 'value': 'new_contributor'})}, + {getGroupArrayInsertAtClauseForClickhouse(config, {'key': 'detail', 'noPrecision': True, 'defaultValue': '[]'})}, + SUM(new_contributor) AS total_new_contributors + FROM + ({inner_sql}) + GROUP BY id + {getOutterOrderAndLimit(config, 'new_contributors')} + ''' + columns = ['id', 'name', 'new_contributors', 'detail', 'total_new_contributors'] + rst = Chaoss.__executeOuterSql(generated_sql, columns, Chaoss.__process) + return rst, generated_sql + + InactiveContributorsOptions = { + # time interval to determine inactive contributor, default: 6 + 'timeInterval': 6, + # time interval unit, default: month + 'timeIntervalUnit': 'month', + # determine contributor by commit or by change request + 'by': 'commit', # 'commit'| 'change request', + # min count of contributions to determine inactive contributor + 'minCount': 0, + 'withBot': False + } + + def chaossInactiveContributors(config, mode='outer') -> (List,str): + config = getMergedConfig(config) + by = filterEnumType(config.get("options", {}).get('by'), ['commit', 'change request'], 'change request') + timeInterval = config.get("options", {}).get('timeInterval', 6) + timeIntervalUnit = filterEnumType(config.get("options", {}).get('timeIntervalUnit'), ['month', 'quarter', 'year'], 'month') + minCount = config.get("options", {}).get('minCount', 0) + whereClauses = [] + + endDate = datetime.date(year = config.get('endYear')+1 if config.get('endMonth')+1>12 else config.get('endYear'), month = (config.get('endMonth')+1)%12, day = 1) + endTimeClause = f"toDate('{endDate.year}-{endDate.month}-1')" + + if by == 'commit': + whereClauses.append("type = 'PushEvent'") + elif by == 'change request': + whereClauses.append("type = 'PullRequestEvent' AND action = 'closed' AND pull_merged = 1") + + repoWhereClause = getRepoWhereClauseForClickhouse(config) + if repoWhereClause: + whereClauses.append(repoWhereClause) + + whereClauses.append(f"created_at < {endTimeClause}") + + inner_sql = f''' + SELECT + id, + argMax(name, time) AS name, + time, + countIf(first_time < time AND contributions <= {minCount}) AS inactive_contributors, + groupArrayIf(author, first_time < time AND contributions <= {minCount}) AS detail + FROM + ( + SELECT + {( + f"arrayJoin(arrayMap(x -> dateAdd({config['groupTimeRange']}, x + 1, toDate('{config['startYear']}-{config['startMonth']}-1')), " + + f"range(toUInt64(dateDiff('{config['groupTimeRange']}', toDate('{config['startYear']}-{config['startMonth']}-1'), {endTimeClause})))))" + ) if config.get('groupTimeRange') else endTimeClause} AS time, + {getGroupIdClauseForClickhouse(config)}, + {('author' if by == 'commit' else 'actor_id, argMax(author, created_at) AS author')}, + min(created_at) AS first_time, + countIf(created_at >= dateSub({timeIntervalUnit}, {timeInterval}, time) AND created_at <= time) AS contributions + FROM + ( + SELECT + repo_id, + repo_name, + org_id, + org_login, + {('arrayJoin(push_commits.name) AS author' if by == 'commit' else 'issue_author_id AS actor_id, issue_author_login AS author')}, + created_at + FROM opensource.gh_events + WHERE {' AND 
'.join(whereClauses)} + {(config.get('options', {}).get('withBot') and by != 'commit') and '' or "HAVING author NOT LIKE '%[bot]'"} + ) + GROUP BY id, {('author' if by == 'commit' else 'actor_id')}, time + ) + GROUP BY id, time + {getInnerOrderAndLimit(config, 'inactive_contributors')} + ''' + if mode == 'origin': + columns = ['id','name', 'time', 'inactive_contributors', 'detail'] + rst = Chaoss.__executeInnnerSql(inner_sql,columns) + return rst, inner_sql + + generated_sql = f''' + SELECT + id, + argMax(name, time) AS name, + {getGroupArrayInsertAtClauseForClickhouse(config, {'key': 'inactive_contributors', 'positionByEndTime': True})}, + {getGroupArrayInsertAtClauseForClickhouse(config, {'key': 'detail', 'noPrecision': True, 'defaultValue': '[]', 'positionByEndTime': True})} + FROM + ({inner_sql}) + GROUP BY id + {getOutterOrderAndLimit(config, 'inactive_contributors')} + ''' + columns = ['id', 'name', 'inactive_contributors', 'detail'] + rst = Chaoss.__executeOuterSql(generated_sql, columns, Chaoss.__process) + return rst, generated_sql + + InactiveContributorsOptions = { + # normalize the results by this option as max value + 'normalize': 100 + } + + def __chaossActiveDatesAndTimes(config, type, mode='outer') -> (List,str): + config = getMergedConfig(config) + whereClauses = [getTimeRangeWhereClauseForClickhouse(config)] + + if type == 'user': + userWhereClause = getUserWhereClauseForClickhouse(config) + if userWhereClause: + whereClauses.append(userWhereClause) + elif type == 'repo': + repoWhereClause = getRepoWhereClauseForClickhouse(config) + if repoWhereClause: + whereClauses.append(repoWhereClause) + else: + raise ValueError(f"Not supported type: {type}") + + inner_sql = f''' + SELECT id, argMax(name, time) AS name, time, arrayMap(x -> {f"round(x*{config.get('options', {}).get('normalize')} * max(count))" if config.get('options', {}).get('normalize') else 'x'}, + groupArrayInsertAt(0, 168)(count, toUInt32((day - 1) * 24 + hour))) AS count + FROM + ( + SELECT + {getGroupTimeClauseForClickhouse(config)}, + {getGroupIdClauseForClickhouse(config, type)}, + toHour(created_at) AS hour, + toDayOfWeek(created_at) AS day, + COUNT() AS count + FROM opensource.gh_events + WHERE {' AND '.join(whereClauses)} + GROUP BY id, time, hour, day + ORDER BY day, hour + ) + GROUP BY id, time + {getInnerOrderAndLimit(config, 'count', 1)} + ''' + if mode == 'origin': + columns = ['id','name', 'time', 'list', 'count'] + rst = Chaoss.__executeInnnerSql(inner_sql,columns) + return rst, inner_sql + + generated_sql = f''' + SELECT + id, + argMax(name, time) AS name, + {getGroupArrayInsertAtClauseForClickhouse(config, {'key': 'count', 'noPrecision': True, 'defaultValue': '[]'})} + FROM + ({inner_sql}) + GROUP BY id + {getOutterOrderAndLimit(config, 'count', 1)} + ''' + columns = ['id', 'name', 'count'] + rst = Chaoss.__executeOuterSql(generated_sql, columns, Chaoss.__process) + return rst, generated_sql + + def chaossUserActiveDatesAndTimes(config, mode='outer') -> (List,str): + return Chaoss.__chaossActiveDatesAndTimes(config, 'user', mode) + + def chaossRepoActiveDatesAndTimes(config, mode='outer') -> (List,str): + return Chaoss.__chaossActiveDatesAndTimes(config, 'repo', mode) + \ No newline at end of file diff --git a/python_v2/metrics/index.py b/python_v2/metrics/index.py new file mode 100644 index 000000000..4a3942041 --- /dev/null +++ b/python_v2/metrics/index.py @@ -0,0 +1,275 @@ +from .basic import QueryConfig, \ + getMergedConfig, \ + getRepoWhereClauseForNeo4j, \ + getTimeRangeWhereClauseForNeo4j, 
\ + getTimeRangeSumClauseForNeo4j, \ + getUserWhereClauseForNeo4j, \ + getRepoWhereClauseForClickhouse,\ + getUserWhereClauseForClickhouse,\ + getTimeRangeWhereClauseForClickhouse,\ + getGroupArrayInsertAtClauseForClickhouse,\ + getGroupTimeClauseForClickhouse,\ + getGroupIdClauseForClickhouse +from label_data_utils import getLabelData +from db.neo4j_wrapper import Neo4jWrapper +from db.clickhouse_wrapper import ClickhouseWrapper +from functools import cmp_to_key +import numpy as np +clickhouse = ClickhouseWrapper() +neo4j = Neo4jWrapper() + +class Index(): + def getRepoOpenrank(config): + """_summary_ + + Args: + config (QueryConfig): config of query. + Returns: + neo4j cursor: query results of neo4j + """ + config = getMergedConfig(config) + calType = 'open_rank' + repoWhereClause = getRepoWhereClauseForNeo4j(config) + timeWhereClause = getTimeRangeWhereClauseForNeo4j(config, 'r') + timeActivityOrOpenrankClause = getTimeRangeSumClauseForNeo4j(config, 'r.{}'.format(calType)) + if not config.get('groupBy'): + query = 'MATCH (r:Repo) WHERE {} {} RETURN r.name AS repo_name, r.org_login AS org, [{}] AS {} ORDER BY reverse({}) {} {};'.format(repoWhereClause+' AND ' if repoWhereClause else '', timeWhereClause, ','.join(timeActivityOrOpenrankClause), calType, calType, config.get('order'), 'LIMIT {}'.format(config.get('limit')) if config.get('limit') > 0 else '') + return neo4j.query(query) + elif config.get('groupBy') == 'org': + query = 'MATCH (r:Repo) WHERE {} {} RETURN r.org_login AS org_login, count(r.id) AS repo_count, [{}] AS {} ORDER BY reverse({}) {} {};'.format(repoWhereClause+' AND ' if repoWhereClause else '', timeWhereClause, list(map(lambda i:'round(SUM({}), {})'.format(i, config.get('percision')), timeActivityOrOpenrankClause)), calType, calType, config.get('order'), 'LIMIT {}'.format(config.get('limit')) if config.get('limit') > 0 else '') + return neo4j.query(query) + else: + query = 'MATCH (r:Repo) WHERE {} {} RETURN r.id AS repo_id, r.org_id AS org_id, [{}] AS {};'.format(repoWhereClause + ' AND ' if repoWhereClause else '', timeWhereClause, ','.join(timeActivityOrOpenrankClause), calType) + queryResult = neo4j.query(query) + labelData = list(filter(lambda l: l.get('type') == config.get('groupBy'), getLabelData())) if getLabelData() != None else None + result = {} + if labelData == None: return None + for row in queryResult: + labels = list(filter(lambda l: int(row.get('repo_id')) in l.get('githubRepos') or int(row.get('org_id')) in l.get('githubOrgs'),labelData)) + for label in labels: + if not label.get('name') in result.keys(): values = row[calType] + else: + values = result.get(label.get('name'))[calType] + for i in range(len(values)): + values[i] += row[calType][i] + result[label.get('name')] = { + 'label': label.get('name'), + 'repo_count': (result.get(label.get('name'))['repo_count'] if label.get('name') in result else 0) + 1, + } + result[label.get('name')][calType] = values + resultArr = list(result.values()) + if config.get('order') == 'ASC': resultArr.sort(key = cmp_to_key(lambda a, b: a[calType][len(a[calType]) - 1] - b[calType][len(b[calType]) - 1])) + if config.get('order') == 'DESC': resultArr.sort(key = cmp_to_key(lambda a, b: b[calType][len(b[calType]) - 1] - a[calType][len(a[calType]) - 1])) + for i in resultArr: + i[calType] = np.around(i[calType]) + return resultArr[0:config.get('limit')] + + def getUserOpenrank(config): + """_summary_ + + Args: + config (QueryConfig): config of query. 
+ Returns: + neo4j cursor: query results of neo4j + """ + config = getMergedConfig(config) + calType = 'open_rank' + userWhereClause = getUserWhereClauseForNeo4j(config) + timeWhereClause = getTimeRangeWhereClauseForNeo4j(config, 'u') + timeActivityClause = getTimeRangeSumClauseForNeo4j(config, 'u.{}'.format(calType)) + query = 'MATCH (u:User) WHERE {} {} RETURN u.login AS user_login, [{}] AS {} ORDER BY {} {} {};'.format(userWhereClause +' AND ' if userWhereClause else '', timeWhereClause, ','.join(timeActivityClause), calType, calType, config.get('order'), 'LIMIT {}'.format(config.get('limit')) if config.get('limit') > 0 else '') + return neo4j.query(query) + + def getRepoActivity(config): + config = getMergedConfig(config) + whereClauses = ["type IN ('IssuesEvent', 'IssueCommentEvent', 'PullRequestEvent', 'PullRequestReviewCommentEvent')"] # specify types to reduce memory usage and calculation + repoWhereClause = getRepoWhereClauseForClickhouse(config) + if repoWhereClause: whereClauses.append(repoWhereClause) + whereClauses.append(getTimeRangeWhereClauseForClickhouse(config)) + sql = "SELECT id, argMax(name, time) AS name, \ + {}, \ + {}, \ + {}, \ + {}, \ + {}, \ + {} \ + FROM \ + (".format(getGroupArrayInsertAtClauseForClickhouse(config, { 'key': 'activity', 'defaultValue': '0' }), + getGroupArrayInsertAtClauseForClickhouse(config, { 'key': 'issue_comment', 'defaultValue': '0' }), + getGroupArrayInsertAtClauseForClickhouse(config, { 'key': 'open_issue', 'defaultValue': '0' }), + getGroupArrayInsertAtClauseForClickhouse(config, { 'key': 'open_pull', 'defaultValue': '0' }), + getGroupArrayInsertAtClauseForClickhouse(config, { 'key': 'review_comment', 'defaultValue': '0' }), + getGroupArrayInsertAtClauseForClickhouse(config, { 'key': 'merged_pull', 'defaultValue': '0' }), + ) + \ + "SELECT \ + {}, \ + {}, \ + ROUND(SUM(activity), 2) AS activity, \ + SUM(issue_comment) AS issue_comment, \ + SUM(open_issue) AS open_issue, \ + SUM(open_pull) AS open_pull, \ + SUM(review_comment) AS review_comment, \ + SUM(merged_pull) AS merged_pull \ + FROM \ + (".format(getGroupTimeClauseForClickhouse(config, 'month'), getGroupIdClauseForClickhouse(config, 'repo', 'month')) + \ + "SELECT \ + toStartOfMonth(created_at) AS month, \ + repo_id, argMax(repo_name, created_at) AS repo_name, \ + org_id, argMax(org_login, created_at) AS org_login, \ + if(type='PullRequestEvent' AND action='closed' AND pull_merged=1, issue_author_id, actor_id) AS actor_id, \ + countIf(type='IssueCommentEvent' AND action='created') AS issue_comment, \ + countIf(type='IssuesEvent' AND action='opened') AS open_issue, \ + countIf(type='PullRequestEvent' AND action='opened') AS open_pull, \ + countIf(type='PullRequestReviewCommentEvent' AND action='created') AS review_comment, \ + countIf(type='PullRequestEvent' AND action='closed' AND pull_merged=1) AS merged_pull, \ + sqrt({}*issue_comment + {}*open_issue + {}*open_pull + {}*review_comment + {}*merged_pull) AS activity \ + FROM opensource.gh_events \ + WHERE {} \ + GROUP BY repo_id, org_id, actor_id, month \ + HAVING activity > 0 \ + ) \ + GROUP BY id, time\ + {}\ + ) \ + GROUP BY id \ + ORDER BY activity[-1] {} \ + FORMAT JSONCompact".format(Index.ISSUE_COMMENT_WEIGHT, Index.OPEN_ISSUE_WEIGHT, + Index.OPEN_PULL_WEIGHT, Index.REVIEW_COMMENT_WEIGHT, Index.PULL_MERGED_WEIGHT, + ' AND '.join(whereClauses), + 'ORDER BY activity DESC LIMIT {} BY time'.format(config.get('limit')) if config.get('limit') > 0 else '', + config.get('order') + ) # use JSONCompact to reduce network I/O + + 
result = clickhouse.query(sql) + def return_row(row): + id, name, activity, issue_comment, open_issue, open_pull, review_comment, merged_pull = row + return { + 'id':id, + 'name':name, + 'activity':activity, + 'issue_comment':issue_comment, + 'open_issue':open_issue, + 'open_pull':open_pull, + 'review_comment':review_comment, + 'merged_pull':merged_pull, + } + return list(map(return_row, result)) + + def getUserActivity(config = QueryConfig, withBot = True): + config = getMergedConfig(config) + whereClauses = ["type IN ('IssuesEvent', 'IssueCommentEvent', 'PullRequestEvent', 'PullRequestReviewCommentEvent')"] # specify types to reduce memory usage and calculation + userWhereClause = getUserWhereClauseForClickhouse(config) + if userWhereClause != None: whereClauses.append(userWhereClause) + whereClauses.append(getTimeRangeWhereClauseForClickhouse(config)) + sql = "SELECT id, argMax(name, time) AS name, \ + {}, \ + {}, \ + {}, \ + {}, \ + {}, \ + {} \ + FROM \ + (".format(getGroupArrayInsertAtClauseForClickhouse(config, { 'key': 'activity', 'defaultValue': '0' }), + getGroupArrayInsertAtClauseForClickhouse(config, { 'key': 'issue_comment', 'defaultValue': '0' }), + getGroupArrayInsertAtClauseForClickhouse(config, { 'key': 'open_issue', 'defaultValue': '0' }), + getGroupArrayInsertAtClauseForClickhouse(config, { 'key': 'open_pull', 'defaultValue': '0' }), + getGroupArrayInsertAtClauseForClickhouse(config, { 'key': 'review_comment', 'defaultValue': '0' }), + getGroupArrayInsertAtClauseForClickhouse(config, { 'key': 'merged_pull', 'defaultValue': '0' }) + ) + \ + "SELECT \ + {}, \ + {}, \ + ROUND(SUM(activity), 2) AS activity, \ + SUM(issue_comment) AS issue_comment, \ + SUM(open_issue) AS open_issue, \ + SUM(open_pull) AS open_pull, \ + SUM(review_comment) AS review_comment, \ + SUM(merged_pull) AS merged_pull \ + FROM \ + (".format(getGroupTimeClauseForClickhouse(config, 'month'), getGroupIdClauseForClickhouse(config, 'actor', 'month')) + \ + "SELECT \ + toStartOfMonth(created_at) AS month, \ + repo_id, \ + if(type='PullRequestEvent' AND action='closed' AND pull_merged=1, issue_author_id, actor_id) AS actor_id, \ + argMax(if(type='PullRequestEvent' AND action='closed' AND pull_merged=1, issue_author_login, actor_login), created_at) AS actor_login, \ + countIf(type='IssueCommentEvent' AND action='created') AS issue_comment, \ + countIf(type='IssuesEvent' AND action='opened') AS open_issue, \ + countIf(type='PullRequestEvent' AND action='opened') AS open_pull, \ + countIf(type='PullRequestReviewCommentEvent' AND action='created') AS review_comment, \ + countIf(type='PullRequestEvent' AND action='closed' AND pull_merged=1) AS merged_pull, \ + sqrt({}*issue_comment + {}*open_issue + {}*open_pull + {}*review_comment + {}*merged_pull) AS activity \ + FROM opensource.gh_events \ + WHERE {} \ + GROUP BY repo_id, actor_id, month \ + HAVING activity > 0 {} \ + ) \ + GROUP BY id, time \ + {} \ + ) \ + GROUP BY id \ + ORDER BY activity[-1] {} \ + FORMAT JSONCompact".format(Index.ISSUE_COMMENT_WEIGHT, Index.OPEN_ISSUE_WEIGHT, Index.OPEN_PULL_WEIGHT, Index.REVIEW_COMMENT_WEIGHT, Index.PULL_MERGED_WEIGHT, + ' AND '.join(whereClauses), '' if withBot else 'AND actor_login NOT LIKE \'%[bot]\'', + 'ORDER BY activity DESC LIMIT {} BY time'.format(config.get('limit')) if config.get('limit') > 0 else '', + config.get('order')) + + result = clickhouse.query(sql) + def return_row(row): + id, name, activity, issue_comment, open_issue, open_pull, review_comment, merged_pull = row + return { + 'id':id, + 'name':name, + 
'activity':activity, + 'issue_comment':issue_comment, + 'open_issue':open_issue, + 'open_pull':open_pull, + 'review_comment':review_comment, + 'merged_pull':merged_pull, + } + return list(map(return_row, result)) + + def getAttention(config: QueryConfig): + """_summary_ + + Args: + config (QueryConfig): _description_ + """ + config = getMergedConfig(config) + whereClauses = ["type IN (\'WatchEvent\', \'ForkEvent\')"] + repoWhereClause = getRepoWhereClauseForClickhouse(config) + if repoWhereClause != None: whereClauses.append(repoWhereClause) + whereClauses.append(getTimeRangeWhereClauseForClickhouse(config)) + + sql = ' \ + SELECT \ + id, \ + argMax(name, time) AS name, \ + {} \ + FROM \ + ('.format(getGroupArrayInsertAtClauseForClickhouse(config, { 'key': 'attention' })) + \ + 'SELECT \ + {}, \ + {}, \ + countIf(type=\'WatchEvent\') AS stars, \ + countIf(type=\'ForkEvent\') AS forks, \ + stars + 2 * forks AS attention \ + FROM opensource.gh_events \ + WHERE {} \ + GROUP BY id, time \ + {} \ + ) \ + GROUP BY id \ + ORDER BY attention[-1] {} \ + FORMAT JSONCompact'.format(getGroupTimeClauseForClickhouse(config), getGroupIdClauseForClickhouse(config), ' AND '.join(whereClauses), + 'ORDER BY attention DESC LIMIT {} BY time'.format(config.get('limit')) if config.get('limit') > 0 else '', + config.get('order')) + + result = clickhouse.query(sql) + def getResult(row): + id, name, attention = row + return { + 'id':id, + 'name':name, + 'attention':attention, + } + return list(map(getResult, result)) diff --git a/python_v2/metrics/related_users.py b/python_v2/metrics/related_users.py new file mode 100644 index 000000000..96eb2e368 --- /dev/null +++ b/python_v2/metrics/related_users.py @@ -0,0 +1,11 @@ +from .basic import getMergedConfig, getRepoWhereClauseForNeo4j, getTimeRangeWhereClauseForNeo4j +from db.neo4j_wrapper import Neo4jWrapper +neo4j = Neo4jWrapper() + +class Relation(): + def getRelatedUsers(config): + config = getMergedConfig(config) + repoWhereClause = getRepoWhereClauseForNeo4j(config) + timeWhereClause = getTimeRangeWhereClauseForNeo4j(config, 'a') + query = 'MATCH (r:Repo)<-[a:ACTION]-(u:User) WHERE {} {} RETURN DISTINCT u.login AS user_login {};'.format(repoWhereClause + ' AND ' if repoWhereClause != None else '', timeWhereClause, 'LIMIT {}'.format(config.get('limit')) if config.get('limit') > 0 else '') + return neo4j.query(query) diff --git a/python_v2/open_digger.py b/python_v2/open_digger.py new file mode 100644 index 000000000..4a583716d --- /dev/null +++ b/python_v2/open_digger.py @@ -0,0 +1,53 @@ +import label_data_utils as label +from functools import cmp_to_key +import metrics +from db.clickhouse_wrapper import ClickhouseWrapper +from db.neo4j_wrapper import Neo4jWrapper +import plotly.graph_objs as go +from plotly.subplots import make_subplots + +class openDigger(object): + def __init__(self): + self.label = label + self.render = go + self.metric = metrics.Metric() + self.clickhouse = ClickhouseWrapper() + self.neo4j = Neo4jWrapper() + + class quick(): + @classmethod + def showAll(self, repoName, startYear = 2015, endYear = 2021): + config = { 'repoNames': [repoName], 'startYear': startYear, 'endYear': endYear, 'groupTimeRange': 'month' } + activity = self.index.getRepoActivity(config) + openrank = self.index.getRepoOpenrank(config) + for year in range(startYear, endYear + 1): + for month in range(1, 13): + k = '{}{}'.format(year, month) + fig = make_subplots(specs=[[{"secondary_y": True}]]) + fig.add_trace( + openDigger().render.Scatter( + y = 
activity[0].get('activity'),
+                        mode="markers+lines",
+                        name='activity'
+                    ))
+                fig.add_trace(
+                    openDigger().render.Scatter(
+                        y = openrank[0].get('open_rank'),
+                        mode="markers+lines",
+                        name='openrank'
+                    ), secondary_y=True)
+                fig.update_layout(
+                    title="Activity/OpenRank for {} from {} to {}".format(repoName, startYear, endYear),
+                )
+                fig.show()
+
+    def getRank(self, values, nameGetter, valueGetter):
+        # Rank every entity at each time point: for each index i of the value
+        # arrays, sort entities by that value (descending) and record the
+        # 1-based rank, or None when the value is 0 (no activity in that period).
+        resultMap = {}
+        for v in values:
+            resultMap[nameGetter(v)] = []
+        valueLength = len(valueGetter(values[0]))
+        for i in range(valueLength):
+            values.sort(key = cmp_to_key(lambda a, b: valueGetter(b)[i] - valueGetter(a)[i]))
+            for index, v in enumerate(values):
+                resultMap.get(nameGetter(v)).append(None if valueGetter(v)[i] == 0 else index + 1)
+        return list(map(lambda e: {'name': e[0], 'values': e[1],}, resultMap.items()))
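
For reviewers who want to try the new kernel, below is a minimal usage sketch; it is not part of the patch. It assumes the ClickHouse sample data and `local_config.py` described in `python_v2/README.md` are in place and that the `python_v2` directory is on `sys.path`; the repository name, date range, and printed slices are placeholders for illustration only.

```python
# Minimal sketch (not part of this patch): calling the metrics added in
# python_v2/metrics from a notebook or workspace.py inside python_v2.
# Assumes ClickHouse sample data is loaded and local_config.py points at it.
from metrics.chaoss import Chaoss
from metrics.index import Index

config = {
    'repoNames': ['X-lab2017/open-digger'],   # placeholder repository
    'startYear': 2021, 'startMonth': 1,
    'endYear': 2021, 'endMonth': 12,
    'groupTimeRange': 'month',
    'order': 'DESC', 'limit': 10,
}

# CHAOSS metric from metrics/chaoss.py: new contributors per month.
rows, sql = Chaoss.chaossNewContributors(config)
print(sql)        # the generated ClickHouse SQL
print(rows[:3])   # first few result rows

# Activity index from metrics/index.py.
print(Index.getRepoActivity(config)[:3])
```

Passing `mode='origin'` to the CHAOSS functions returns the un-aggregated inner query results (and the inner SQL) instead of the grouped arrays.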