Skip to content

Commit

Permalink
refactor: improve python kernel v2.0(#1375)
Browse files Browse the repository at this point in the history
* refactor: improve python kernel

* organize open-digger python kernel code in a more object-oriented way

* refactor:python->python_v2

* refactor: save python and create python_v2

Delete config file

add __init__.py

add __init__.py

add .gitignore
  • Loading branch information
l1tok authored Sep 25, 2023
1 parent fafcdd2 commit 90ad91c
Show file tree
Hide file tree
Showing 12 changed files with 1,812 additions and 1 deletion.
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,12 @@ node_modules
sample_data/data

# Ignore python
*/**/__init__.py
*/**/__pycache__
python/workspace/*
python/workspace.py
python/local_config.py

# Ignore python_v2
python_v2/workspace/*
python_v2/workspace.py
python_v2/local_config.py
52 changes: 52 additions & 0 deletions python_v2/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Getting Started

## If you want to do some data analysis work:
Start your ClickHouse container, which should be set up in [Clickhouse-sample-data](../sample_data/README.md)

1. Clone OpenDigger `git clone https://github.com/X-lab2017/open-digger.git`

2. Enter the repo path `cd open-digger`

3. Go to the `python` folder in the open-digger root directory and create a file named `local_config.py` (this file has already been added to the `.gitignore` file) for the Python kernel with the following contents:

```python
local_config = {
'db': {
'clickhouse': {
'host':'172.17.0.1',
'user':'default'
},
'neo4j':{
'port': '7687',
}
}
}
```
the `host` above is the host of the ClickHouse server. We can find it using `docker inspect container_name`, and copy the `Gateway` value like this:

```shell
$ docker inspect container_name | grep Gateway
"Gateway": "172.17.0.1",
"IPv6Gateway": "",
"Gateway": "172.17.0.1",
"IPv6Gateway": "",
```
If you use your own data, you can also change the `host` field to your own host IP.
4. Use `docker build -t opendigger-jupyter-python:1.0 $(pwd)` to make a docker image, this image is based on `miniconda`. You can check the `Dockerfile` in root directory.

> If you are using **Windows CMD**, all the `$(pwd)` here should be replaced by `%cd%`. And if you are using **Windows Powershell**, all the `$(pwd)` here should be replaced by `${pwd}`.
>
> **Notice:** Pathnames of directories like "pwd" may use `\` to join the directory in some versions of Windows. We recommend using absolute paths.
5. Then we can use `docker run -it --name python_notebook_name --rm -p 8888:8888 -v $(pwd):/python_kernel/notebook opendigger-jupyter-python:1.0` to create and run the container.

6. Open the link in console log like `http://127.0.0.1:8888/lab?token=xxxxx`.

7. If the source code under the `python` folder changes, you need to stop the notebook docker using `docker stop python_notebook_name` and restart the notebook kernel using `docker run -it --name python_notebook_name --rm -p 8888:8888 -v $(pwd):/python_kernel/notebook opendigger-jupyter-python:1.0` to reload the source code.

8. You can find the notebook folder, where we provide demos in the handbook. You can create a new file, and happy data exploring!
Attention: you need to do this work in the `notebook` folder or another parallel folder. If you run it in the root directory, it will not work because of Python import rules.

## If you are a developer:

You can also create `workspace.py` in the `python` folder and run it.
50 changes: 50 additions & 0 deletions python_v2/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
inited = False
config = {
'general': {
'owner': 'X-lab2017',
'repo': 'OpenDigger',
'baseUrl': 'http://open-digger.opensource-service.cn/',
},
'db': {
'clickhouse': {
'host': 'localhost', #python里的clickhouse_driver用的tcp端口9000
'port': '9000',
'user': '',
'password': '',
'protocol': 'http:',
'format': 'JSON',
'database': 'opensource',
},
'neo4j': {
'host':'neo4j://localhost:7687',
}
},
'oss': {
'ali': {
'region': '',
'accessKeyId': '',
'accessKeySecret': '',
'bucket': '',
}
},
'ci': {
'token':'',
}
}
def mergeConfig(base_config, local_config):
for key, val in local_config.items():
if isinstance(val, dict):
mergeConfig(base_config[key], val)
else:
base_config[key] = val
return base_config
def getConfig():
global config
if not inited:
try:
from local_config import local_config
config = mergeConfig(config, local_config)
return config
except:
return config
return config
24 changes: 24 additions & 0 deletions python_v2/db/clickhouse_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from easydict import EasyDict
from config import getConfig
from clickhouse_driver import Client

class ClickhouseWrapper(object):
def __init__(self):
if not hasattr(ClickhouseWrapper, "_first_init"):
config = EasyDict(getConfig()).db.clickhouse
try:
self.client = Client(config.host, config.port, config.database, config.user, config.password)
except :
print("CLICKHOUSE INIT FAILED")
def __new__(cls, *args, **kwargs):

if not hasattr(ClickhouseWrapper, "_instance" ):
ClickhouseWrapper._instance = object.__new__(cls)
return ClickhouseWrapper._instance


def query(self, q):
return self.client.execute(q)

def queryDataframe(self,q):
return self.client.query_dataframe(q)
23 changes: 23 additions & 0 deletions python_v2/db/neo4j_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from py2neo import Graph
from easydict import EasyDict
from config import getConfig

class Neo4jWrapper(object):
def __init__(self):
neo4j_config = EasyDict(getConfig()).db.neo4j
# self.driver = Graph(neo4j_config.host)
try:
self.driver = Graph(neo4j_config.host)
except Exception as e:
print(e)
print("NEO4J INIT ERROR")

def __new__(cls, *args, **kwargs):

if not hasattr(Neo4jWrapper, "_instance" ):
Neo4jWrapper._instance = object.__new__(cls)
return Neo4jWrapper._instance

def query(self, query_sql):
result = self.driver.run(query_sql) # return a cursor object
return result.data()
175 changes: 175 additions & 0 deletions python_v2/label_data_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
import os
import yaml
import platform
from typing import List
labelInputDir = '../labeled_data'
labelInputPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), labelInputDir)

supportedTypes = set(['Region', 'Company', 'Community', 'Project', 'Foundation','Tech-0', 'Tech-1', 'Tech-2','Domain-0', 'Bot'])

supportedKey = set(['label', 'github_repo', 'github_org', 'github_user'])
GitHubData = {
'githubRepos': [],
'githubOrgs': [],
'githubUsers': [],
}

emptyData = {
'githubRepos': [],
'githubOrgs': [],
'githubUsers': [],
}

LabelItem = {
'identifier': '',
'content': {
'name': '',
'type': '',
'data': '',
},
'parsed': True
}
LabelItem.update(GitHubData)

ParsedLabelItem = {
'identifier': '',
'type': '',
'name': ''
}
ParsedLabelItem.update(GitHubData)

def getLabelData():
if not os.path.isdir(labelInputPath):
print('{} input path is not a directory.'.format(labelInputPath))
return []
labelMap = {} #<string, LabelItem>()
indexFileName = '{}index.yml'.format(os.path.sep)
labelFileSuffix = '.yml'
def getfileProcessor(f):
if not f.endswith('.yml'): return
# convert windows favor path to linux favor path

identifier = processLabelIdentifier(':{}'.format(f[0:f.find(indexFileName)] if f.endswith(indexFileName) else f[0:f.find(labelFileSuffix)]))
content = open(os.path.join(labelInputPath, f),encoding='utf-8').read()
content = yaml.load(content,Loader=yaml.FullLoader)
labelMap[identifier] = {
'identifier':identifier,
'content':content,
'parsed': False,
'githubOrgs': [],
'githubRepos': [],
'githubUsers': [],
}

readPath(labelInputPath, '', getfileProcessor)
data = processLabelItems(labelMap)
return data

def readPath(p, base, fileProcessor):
"""_summary_
Args:
p (string): _description_
base (string): _description_
fileProcessor(f:string)->void.
"""
if not os.path.isdir(p):
fileProcessor(base)
else:
for f in os.listdir(p):
readPath(os.path.join(p, f), os.path.join(base, f), fileProcessor)

def processLabelItems(map_item)->List:
"""_summary_
Args:
map_item (Map<string, LabelItem>): _description_
LabelItem (_type_): _description_
Returns:
ParsedLabelItem[]: _description_
"""
for item in map_item.values():
parseItem(item, map_item)
return list(map(lambda item: {'identifier': item.get('identifier'),
'type': item.get('content').get('type'),
'name': item.get('content').get('name'),
'githubRepos': list(set(item.get('githubRepos'))),
'githubOrgs': list(set(item.get('githubOrgs'))),
'githubUsers': list(set(item.get('githubUsers'))),
}, list(map_item.values())))

def parseItem(item, map_item):
"""_summary_
Args:
item (LabelItem): _description_
map_item (Map<string, LabelItem>): _description_
"""
if item.get('parsed'): return
if item.get('content').get('type') and item.get('content').get('type') not in supportedTypes:
raise Exception('Not supported type {}'.format(item.get('content').get('type')))
for key in item.get('content').get('data'):
if not key in supportedKey:
raise Exception('Not supported element={}, identifier={}').format(key, item.get('identifier'))
if key == 'github_repo':
item.get('githubRepos').extend(x for x in item.get('content').get('data')[key])
elif key == 'github_org':
item.get('githubOrgs').extend(x for x in item.get('content').get('data')[key])
elif key == 'github_user':
item.get('githubUsers').extend(x for x in item.get('content').get('data')[key])
elif key == 'label':
labels = item.get('content').get('data')[key]
for label in labels:
identifier = label if label.startswith(':') else processLabelIdentifier(os.path.join(item.get('identifier'), label))
innerItem = map_item.get(identifier)
if innerItem == None:
raise Exception('Can not find nest identifier {} for {}'.format(identifier, item.get('identifier')))
if not innerItem.get('parsed'):
parseItem(innerItem, map_item)
item.get('githubOrgs').extend(x for x in innerItem.get('githubOrgs'))
item.get('githubRepos').extend(x for x in innerItem.get('githubRepos'))
item.get('githubUsers').extend(x for x in innerItem.get('githubUsers'))
item['parsed'] = True

def processLabelIdentifier(identifier: str)-> str:
if platform.system() == 'Windows':
return os.path.altsep.join(identifier.split(os.path.sep))
else: return identifier

def labelDataToGitHubData(data)->GitHubData:
"""_summary_
Args:
data (list of ParsedLabelItem): _description_
Returns:
GitHubData: _description_
"""
repoSet = set([])
orgSet = set([])
userSet = set([])
for item in data:
for r in item.get('githubRepos'): repoSet.add(r)
for o in item.get('githubOrgs'): orgSet.add(o)
for u in item.get('githubUsers'): userSet.add(u)
return {
"githubRepos": list(repoSet),
"githubOrgs": list(orgSet),
"githubUsers": list(userSet),
}

def getGitHubData(typeOrIds: List)-> GitHubData:
"""_summary_
Args:
typeOrIds (List<str>): _description_
Returns:
GitHubData: _description_
"""
if len(typeOrIds) == 0: return emptyData
data = getLabelData()
if data == None: return emptyData
arr = list(filter(lambda i: i.get('type') in typeOrIds or i.get('identifier') in typeOrIds, data))
return labelDataToGitHubData(arr)
4 changes: 4 additions & 0 deletions python_v2/metrics/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
class Metric(object):
from metrics.chaoss import Chaoss as chaoss
from metrics.index import Index as index
from metrics.related_users import Relation as relation
Loading

0 comments on commit 90ad91c

Please sign in to comment.