-
Notifications
You must be signed in to change notification settings - Fork 90
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: improve python kernel v2.0(#1375)
* refactor: improve python kernel * organize open-digger python kernel code in a more object-oriented way * refactor:python->python_v2 * refactor: save python and create python_v2 Delete config file add __init__.py add __init__.py add .gitignore
- Loading branch information
Showing
12 changed files
with
1,812 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
# Getting Started | ||
|
||
## If you want to do some data analysis work: | ||
Start your ClickHouse container, which should be set up in [Clickhouse-sample-data](../sample_data/README.md) | ||
|
||
1. Clone OpenDigger `git clone https://github.com/X-lab2017/open-digger.git` | ||
|
||
2. Enter the repo path `cd open-digger` | ||
|
||
3. Go to the `python` folder in the open-digger root directory, create a file named `local_config.py` (this file has already been added to the `.gitignore` file) for the Python kernel with the following contents: | ||
|
||
```python | ||
local_config = { | ||
'db': { | ||
'clickhouse': { | ||
'host':'172.17.0.1', | ||
'user':'default' | ||
}, | ||
'neo4j':{ | ||
'port': '7687', | ||
} | ||
} | ||
} | ||
``` | ||
The `host` above is the host of the ClickHouse server. We can find it using `docker inspect container_name`, and copy the `Gateway` like this: | ||
|
||
```shell | ||
$ docker inspect container_name | grep Gateway | ||
"Gateway": "172.17.0.1", | ||
"IPv6Gateway": "", | ||
"Gateway": "172.17.0.1", | ||
"IPv6Gateway": "", | ||
``` | ||
If you use your own data, you can also change `host` field to your own host IP | ||
4. Use `docker build -t opendigger-jupyter-python:1.0 $(pwd)` to make a docker image, this image is based on `miniconda`. You can check the `Dockerfile` in root directory. | ||
|
||
> If you are using **Windows CMD**, all the `$(pwd)` here should be replaced by `%cd%`. And if you are using **Windows Powershell**, all the `$(pwd)` here should be replaced by `${pwd}`. | ||
> | ||
> **Notice:** Pathnames of directories like "pwd" may use `\` to join the directory in some versions of Windows. We recommend using absolute paths. | ||
5. Then we can use `docker run -it --name python_notebook_name --rm -p 8888:8888 -v $(pwd):/python_kernel/notebook opendigger-jupyter-python:1.0` to create and run the container. | ||
|
||
6. Open the link in console log like `http://127.0.0.1:8888/lab?token=xxxxx`. | ||
|
||
7. If the source code under `python` folder changed, you need to stop the notebook docker using `docker stop python_notebook_name` and restart the notebook kernel using `docker run -it --name python_notebook_name --rm -p 8888:8888 -v $(pwd):/python_kernel/notebook opendigger-jupyter-python:1.0` to reload the source code. | ||
|
||
8. You can find the notebook folder, where we provide demos in the handbook. You can create a new file, and happy data exploring! | ||
Attention: you need to do this work in `notebook` or other parallel folder. If you run in root directory, it can't work because of python import rules. | ||
|
||
## If you are a developer: | ||
|
||
You can also make `workspace.py` in `python` folder. and run it. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# Tracks whether the optional local_config has been merged into `config`.
# NOTE(review): nothing in this module ever sets this to True — confirm
# getConfig is meant to re-attempt the import on every call.
inited = False

# Default (base) configuration for the OpenDigger python kernel.
# Entries here are overridden by the `local_config` dict from an optional
# local_config.py (see getConfig / mergeConfig below).
config = {
    'general': {
        'owner': 'X-lab2017',
        'repo': 'OpenDigger',
        'baseUrl': 'http://open-digger.opensource-service.cn/',
    },
    'db': {
        'clickhouse': {
            'host': 'localhost', # clickhouse_driver in Python uses TCP port 9000
            'port': '9000',
            'user': '',
            'password': '',
            'protocol': 'http:',
            'format': 'JSON',
            'database': 'opensource',
        },
        'neo4j': {
            'host':'neo4j://localhost:7687',
        }
    },
    # Aliyun OSS credentials; left blank by default and expected to be
    # supplied via local_config.
    'oss': {
        'ali': {
            'region': '',
            'accessKeyId': '',
            'accessKeySecret': '',
            'bucket': '',
        }
    },
    'ci': {
        'token':'',
    }
}
def mergeConfig(base_config, local_config):
    """Recursively merge ``local_config`` into ``base_config`` in place.

    Nested dicts are merged key by key; any non-dict value in
    ``local_config`` overwrites (or adds) the corresponding entry in
    ``base_config``.

    Args:
        base_config (dict): dict updated in place with the overrides.
        local_config (dict): overrides to apply on top of ``base_config``.

    Returns:
        dict: ``base_config`` (the same object), for convenience.
    """
    for key, val in local_config.items():
        # Recurse only when BOTH sides hold a dict; the original recursed
        # whenever `val` was a dict and raised KeyError if the key was
        # missing from base_config — now a new key is simply added.
        if isinstance(val, dict) and isinstance(base_config.get(key), dict):
            mergeConfig(base_config[key], val)
        else:
            base_config[key] = val
    return base_config
def getConfig():
    """Return the effective configuration dict.

    On first call, tries to import an optional ``local_config.py`` and
    merge its ``local_config`` dict over the module defaults; later calls
    return the already-merged ``config`` without re-importing.

    Returns:
        dict: the module-level ``config`` (merged with local overrides
        when a ``local_config.py`` is present).
    """
    global config, inited
    if not inited:
        # The original never flipped `inited`, so the import was retried
        # on every call; mark it done up front (even on failure).
        inited = True
        try:
            from local_config import local_config
            config = mergeConfig(config, local_config)
        except ImportError:
            # No local_config.py present — fall back to the defaults.
            # (A bare `except:` here used to hide unrelated errors too.)
            pass
    return config
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
from easydict import EasyDict | ||
from config import getConfig | ||
from clickhouse_driver import Client | ||
|
||
class ClickhouseWrapper(object):
    """Singleton wrapper around a ``clickhouse_driver`` Client.

    ``__new__`` caches one shared instance; ``__init__`` guards against
    re-running its setup via the ``_first_init`` class flag.
    """

    def __init__(self):
        # BUG FIX: the original checked `_first_init` but never set it,
        # so the client was rebuilt on every ClickhouseWrapper() call.
        if not hasattr(ClickhouseWrapper, "_first_init"):
            ClickhouseWrapper._first_init = True
            config = EasyDict(getConfig()).db.clickhouse
            try:
                self.client = Client(config.host, config.port, config.database,
                                     config.user, config.password)
            except Exception:
                # Keep the original best-effort behaviour, but don't
                # swallow SystemExit/KeyboardInterrupt like a bare except.
                print("CLICKHOUSE INIT FAILED")

    def __new__(cls, *args, **kwargs):
        # Classic singleton: allocate the underlying object only once.
        if not hasattr(ClickhouseWrapper, "_instance"):
            ClickhouseWrapper._instance = object.__new__(cls)
        return ClickhouseWrapper._instance

    def query(self, q):
        """Execute query ``q`` and return the rows (list of tuples)."""
        return self.client.execute(q)

    def queryDataframe(self, q):
        """Execute query ``q`` and return the result as a DataFrame."""
        return self.client.query_dataframe(q)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
from py2neo import Graph | ||
from easydict import EasyDict | ||
from config import getConfig | ||
|
||
class Neo4jWrapper(object):
    """Singleton wrapper around a ``py2neo`` Graph connection."""

    def __init__(self):
        # Guard with a class flag so the Graph is not re-created on every
        # Neo4jWrapper() call (the original __init__ ran unconditionally,
        # reconnecting each time despite the singleton __new__).
        if not hasattr(Neo4jWrapper, "_first_init"):
            Neo4jWrapper._first_init = True
            neo4j_config = EasyDict(getConfig()).db.neo4j
            try:
                self.driver = Graph(neo4j_config.host)
            except Exception as e:
                # Best-effort init, mirroring the ClickHouse wrapper.
                print(e)
                print("NEO4J INIT ERROR")

    def __new__(cls, *args, **kwargs):
        # Classic singleton: allocate the underlying object only once.
        if not hasattr(Neo4jWrapper, "_instance"):
            Neo4jWrapper._instance = object.__new__(cls)
        return Neo4jWrapper._instance

    def query(self, query_sql):
        """Run a Cypher statement and return its records as dicts."""
        result = self.driver.run(query_sql)  # returns a cursor object
        return result.data()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
import os | ||
import yaml | ||
import platform | ||
from typing import List | ||
# Directory (relative to this file) holding the labeled-data YAML files.
labelInputDir = '../labeled_data'
labelInputPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), labelInputDir)

# Label types accepted in a label file's `type` field (see parseItem).
supportedTypes = set(['Region', 'Company', 'Community', 'Project', 'Foundation','Tech-0', 'Tech-1', 'Tech-2','Domain-0', 'Bot'])

# Keys accepted in a label file's `data` mapping (see parseItem).
supportedKey = set(['label', 'github_repo', 'github_org', 'github_user'])

# Shape template for the GitHub-entity lists carried by label items.
GitHubData = {
    'githubRepos': [],
    'githubOrgs': [],
    'githubUsers': [],
}

# Empty result returned by getGitHubData when there is nothing to match.
emptyData = {
    'githubRepos': [],
    'githubOrgs': [],
    'githubUsers': [],
}

# Shape template for a raw (unparsed) label item.
LabelItem = {
    'identifier': '',
    'content': {
        'name': '',
        'type': '',
        'data': '',
    },
    'parsed': True
}
# NOTE(review): update() makes LabelItem share the *same* list objects as
# GitHubData (and, below, ParsedLabelItem) — presumably these dicts are
# used only as shape documentation/annotations, never mutated; confirm.
LabelItem.update(GitHubData)

# Shape template for a fully parsed label item.
ParsedLabelItem = {
    'identifier': '',
    'type': '',
    'name': ''
}
ParsedLabelItem.update(GitHubData)
|
||
def getLabelData():
    """Load and parse every ``*.yml`` label file under ``labelInputPath``.

    Returns:
        list: parsed label items (see ``processLabelItems``), or ``[]``
        when the input path does not exist.
    """
    if not os.path.isdir(labelInputPath):
        print('{} input path is not a directory.'.format(labelInputPath))
        return []
    labelMap = {}  # identifier -> raw LabelItem dict
    indexFileName = '{}index.yml'.format(os.path.sep)
    labelFileSuffix = '.yml'

    def getfileProcessor(f):
        """Parse one label file and register it in ``labelMap``."""
        if not f.endswith('.yml'):
            return
        # `<dir>/index.yml` labels the directory itself; any other file
        # labels `<path-without-.yml>`. Identifiers always start with ':'.
        identifier = processLabelIdentifier(':{}'.format(
            f[0:f.find(indexFileName)] if f.endswith(indexFileName) else f[0:f.find(labelFileSuffix)]))
        # BUG FIX: use a context manager so the file handle is closed
        # promptly (the original `open(...).read()` leaked it to the GC).
        with open(os.path.join(labelInputPath, f), encoding='utf-8') as fp:
            content = yaml.load(fp.read(), Loader=yaml.FullLoader)
        labelMap[identifier] = {
            'identifier': identifier,
            'content': content,
            'parsed': False,
            'githubOrgs': [],
            'githubRepos': [],
            'githubUsers': [],
        }

    readPath(labelInputPath, '', getfileProcessor)
    return processLabelItems(labelMap)
|
||
def readPath(p, base, fileProcessor):
    """Recursively walk ``p``, invoking ``fileProcessor`` for each file.

    Args:
        p (str): absolute path currently being visited.
        base (str): path of ``p`` relative to the walk's root; this is
            the value handed to ``fileProcessor``.
        fileProcessor (Callable[[str], None]): callback run once per file.
    """
    if os.path.isdir(p):
        # Descend into each entry, extending both path forms in lockstep.
        for entry in os.listdir(p):
            readPath(os.path.join(p, entry), os.path.join(base, entry), fileProcessor)
    else:
        fileProcessor(base)
|
||
def processLabelItems(map_item)->List:
    """Resolve every raw label item and flatten it to the parsed shape.

    Args:
        map_item (dict): identifier -> raw LabelItem dict.

    Returns:
        list: one ParsedLabelItem-shaped dict per input item, with each
        GitHub entity list de-duplicated.
    """
    # First resolve nested `label` references for every item in place.
    for entry in map_item.values():
        parseItem(entry, map_item)
    # Then project each resolved item down to the parsed shape.
    return [
        {
            'identifier': entry.get('identifier'),
            'type': entry.get('content').get('type'),
            'name': entry.get('content').get('name'),
            'githubRepos': list(set(entry.get('githubRepos'))),
            'githubOrgs': list(set(entry.get('githubOrgs'))),
            'githubUsers': list(set(entry.get('githubUsers'))),
        }
        for entry in map_item.values()
    ]
|
||
def parseItem(item, map_item):
    """Resolve one raw label item in place (idempotent via ``parsed``).

    Extends the item's ``githubRepos``/``githubOrgs``/``githubUsers``
    lists from its own ``data`` section and, for ``label`` entries, from
    the referenced items (resolved recursively).

    Args:
        item (dict): raw LabelItem to resolve.
        map_item (dict): identifier -> raw LabelItem, used to look up
            nested label references.

    Raises:
        Exception: on an unsupported ``type``, an unsupported data key,
            or an unresolvable nested label identifier.
    """
    if item.get('parsed'):
        return
    if item.get('content').get('type') and item.get('content').get('type') not in supportedTypes:
        raise Exception('Not supported type {}'.format(item.get('content').get('type')))
    for key in item.get('content').get('data'):
        if key not in supportedKey:
            # BUG FIX: the original wrote `raise Exception('...').format(...)`,
            # calling .format on the Exception object and raising
            # AttributeError instead of this message.
            raise Exception('Not supported element={}, identifier={}'.format(key, item.get('identifier')))
        if key == 'github_repo':
            item.get('githubRepos').extend(x for x in item.get('content').get('data')[key])
        elif key == 'github_org':
            item.get('githubOrgs').extend(x for x in item.get('content').get('data')[key])
        elif key == 'github_user':
            item.get('githubUsers').extend(x for x in item.get('content').get('data')[key])
        elif key == 'label':
            labels = item.get('content').get('data')[key]
            for label in labels:
                # Relative references are joined onto this item's
                # identifier; absolute ones already start with ':'.
                identifier = label if label.startswith(':') else processLabelIdentifier(os.path.join(item.get('identifier'), label))
                innerItem = map_item.get(identifier)
                if innerItem is None:  # `is None`, not `== None`
                    raise Exception('Can not find nest identifier {} for {}'.format(identifier, item.get('identifier')))
                # Resolve the referenced item first, then absorb its lists.
                if not innerItem.get('parsed'):
                    parseItem(innerItem, map_item)
                item.get('githubOrgs').extend(x for x in innerItem.get('githubOrgs'))
                item.get('githubRepos').extend(x for x in innerItem.get('githubRepos'))
                item.get('githubUsers').extend(x for x in innerItem.get('githubUsers'))
    item['parsed'] = True
|
||
def processLabelIdentifier(identifier: str)-> str:
    """Normalize a label identifier's path separators.

    On Windows, backslash separators produced by ``os.path.join`` are
    rewritten to the alternate separator ('/'); on other platforms the
    identifier is returned untouched.
    """
    if platform.system() != 'Windows':
        return identifier
    return os.path.altsep.join(identifier.split(os.path.sep))
|
||
def labelDataToGitHubData(data)->dict:
    """Union the GitHub entities of many parsed label items.

    Args:
        data (list): ParsedLabelItem-shaped dicts.

    Returns:
        dict: GitHubData-shaped dict whose entity lists are de-duplicated.
    """
    # FIX: the original annotated the return as `GitHubData`, which is a
    # dict *instance* (evaluated at def time), not a type — use `dict`.
    repoSet = set()
    orgSet = set()
    userSet = set()
    for item in data:
        repoSet.update(item.get('githubRepos'))
        orgSet.update(item.get('githubOrgs'))
        userSet.update(item.get('githubUsers'))
    return {
        "githubRepos": list(repoSet),
        "githubOrgs": list(orgSet),
        "githubUsers": list(userSet),
    }
|
||
def getGitHubData(typeOrIds: List)-> dict:
    """Collect GitHub entities for the given label types or identifiers.

    Args:
        typeOrIds (List[str]): label ``type`` values (e.g. 'Company') or
            label identifiers (e.g. ':foo/bar'); an item matching either
            is included.

    Returns:
        dict: GitHubData-shaped dict (githubRepos/githubOrgs/githubUsers).
        NOTE(review): the empty-input paths return the shared module-level
        `emptyData` dict — callers must not mutate it.
    """
    # FIX: return annotation was the dict instance `GitHubData`; use `dict`.
    if len(typeOrIds) == 0:
        return emptyData
    data = getLabelData()
    if data is None:  # `is None`, not `== None` (original idiom bug)
        return emptyData
    matched = [i for i in data if i.get('type') in typeOrIds or i.get('identifier') in typeOrIds]
    return labelDataToGitHubData(matched)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
class Metric(object):
    """Namespace grouping the project's metric implementations.

    The class-level imports bind each metric class as a lowercase
    attribute, so callers can write e.g. ``Metric.chaoss``,
    ``Metric.index`` and ``Metric.relation``.
    """
    from metrics.chaoss import Chaoss as chaoss
    from metrics.index import Index as index
    from metrics.related_users import Relation as relation
Oops, something went wrong.