-
Notifications
You must be signed in to change notification settings - Fork 90
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: improve python kernel v2.0(#1375)
* refactor: improve python kernel * organize open-digger python kernel code in a more object-oriented way * refactor:python->python_v2 * refactor: save python and create python_v2 Delete config file add __init__.py add __init__.py add .gitignore
- Loading branch information
Showing
12 changed files
with
1,812 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
# Getting Started | ||
|
||
## If you want to do some data analysis work: | ||
Start your ClickHouse container, which should be set up in [Clickhouse-sample-data](../sample_data/README.md) | ||
|
||
1. Clone OpenDigger `git clone https://github.com/X-lab2017/open-digger.git` | ||
|
||
2. Enter the repo path `cd open-digger` | ||
|
||
3. Go to the `python` folder in the open-digger root directory, create a file named `local_config.py` (this file has already been added to the `.gitignore` file) for the Python kernel with the following contents: | ||
|
||
```python | ||
local_config = { | ||
'db': { | ||
'clickhouse': { | ||
'host':'172.17.0.1', | ||
'user':'default' | ||
}, | ||
'neo4j':{ | ||
'port': '7687', | ||
} | ||
} | ||
} | ||
``` | ||
The `host` above is the host of the ClickHouse server. We can find it using `docker inspect container_name`, and copy the `Gateway` like this: | ||
|
||
```shell | ||
$ docker inspect container_name | grep Gateway | ||
"Gateway": "172.17.0.1", | ||
"IPv6Gateway": "", | ||
"Gateway": "172.17.0.1", | ||
"IPv6Gateway": "", | ||
``` | ||
If you use your own data, you can also change `host` field to your own host IP | ||
4. Use `docker build -t opendigger-jupyter-python:1.0 $(pwd)` to make a docker image, this image is based on `miniconda`. You can check the `Dockerfile` in root directory. | ||
|
||
> If you are using **Windows CMD**, all the `$(pwd)` here should be replaced by `%cd%`. And if you are using **Windows Powershell**, all the `$(pwd)` here should be replaced by `${pwd}`. | ||
> | ||
> **Notice:** Pathnames of directories like "pwd" may use `\` to join the directory in some versions of Windows. We recommend using absolute paths. | ||
5. Then we can use `docker run -it --name python_notebook_name --rm -p 8888:8888 -v $(pwd):/python_kernel/notebook opendigger-jupyter-python:1.0` to create and run the container. | ||
|
||
6. Open the link in console log like `http://127.0.0.1:8888/lab?token=xxxxx`. | ||
|
||
7. If the source code under `python` folder changed, you need to stop the notebook docker using `docker stop python_notebook_name` and restart the notebook kernel using `docker run -it --name python_notebook_name --rm -p 8888:8888 -v $(pwd):/python_kernel/notebook opendigger-jupyter-python:1.0` to reload the source code. | ||
|
||
8. You can find the notebook folder, where we provide demos in the handbook. You can create a new file, and happy data exploring! | ||
Attention: you need to do this work in `notebook` or other parallel folder. If you run in root directory, it can't work because of python import rules. | ||
|
||
## If you are a developer: | ||
|
||
You can also make `workspace.py` in `python` folder. and run it. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# Tracks whether the optional local_config has been merged into `config`.
# NOTE(review): nothing in this module ever sets this to True — confirm
# getConfig is meant to re-attempt the import on every call.
inited = False

# Default (base) configuration for the OpenDigger python kernel.
# Entries here are overridden by the `local_config` dict from an optional
# local_config.py (see getConfig / mergeConfig below).
config = {
    'general': {
        'owner': 'X-lab2017',
        'repo': 'OpenDigger',
        'baseUrl': 'http://open-digger.opensource-service.cn/',
    },
    'db': {
        'clickhouse': {
            'host': 'localhost', # clickhouse_driver in Python uses TCP port 9000
            'port': '9000',
            'user': '',
            'password': '',
            'protocol': 'http:',
            'format': 'JSON',
            'database': 'opensource',
        },
        'neo4j': {
            'host':'neo4j://localhost:7687',
        }
    },
    # Aliyun OSS credentials; left blank by default and expected to be
    # supplied via local_config.
    'oss': {
        'ali': {
            'region': '',
            'accessKeyId': '',
            'accessKeySecret': '',
            'bucket': '',
        }
    },
    'ci': {
        'token':'',
    }
}
def mergeConfig(base_config, local_config):
    """Recursively merge ``local_config`` into ``base_config`` in place.

    Nested dicts are merged key by key; any non-dict value in
    ``local_config`` overwrites (or adds) the corresponding entry in
    ``base_config``.

    Args:
        base_config (dict): dict updated in place with the overrides.
        local_config (dict): overrides to apply on top of ``base_config``.

    Returns:
        dict: ``base_config`` (the same object), for convenience.
    """
    for key, val in local_config.items():
        # Recurse only when BOTH sides hold a dict; the original recursed
        # whenever `val` was a dict and raised KeyError if the key was
        # missing from base_config — now a new key is simply added.
        if isinstance(val, dict) and isinstance(base_config.get(key), dict):
            mergeConfig(base_config[key], val)
        else:
            base_config[key] = val
    return base_config
def getConfig():
    """Return the effective configuration dict.

    On first call, tries to import an optional ``local_config.py`` and
    merge its ``local_config`` dict over the module defaults; later calls
    return the already-merged ``config`` without re-importing.

    Returns:
        dict: the module-level ``config`` (merged with local overrides
        when a ``local_config.py`` is present).
    """
    global config, inited
    if not inited:
        # The original never flipped `inited`, so the import was retried
        # on every call; mark it done up front (even on failure).
        inited = True
        try:
            from local_config import local_config
            config = mergeConfig(config, local_config)
        except ImportError:
            # No local_config.py present — fall back to the defaults.
            # (A bare `except:` here used to hide unrelated errors too.)
            pass
    return config
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
from easydict import EasyDict | ||
from config import getConfig | ||
from clickhouse_driver import Client | ||
|
||
class ClickhouseWrapper(object):
    """Singleton wrapper around a ``clickhouse_driver`` Client.

    ``__new__`` caches one shared instance; ``__init__`` guards against
    re-running its setup via the ``_first_init`` class flag.
    """

    def __init__(self):
        # BUG FIX: the original checked `_first_init` but never set it,
        # so the client was rebuilt on every ClickhouseWrapper() call.
        if not hasattr(ClickhouseWrapper, "_first_init"):
            ClickhouseWrapper._first_init = True
            config = EasyDict(getConfig()).db.clickhouse
            try:
                self.client = Client(config.host, config.port, config.database,
                                     config.user, config.password)
            except Exception:
                # Keep the original best-effort behaviour, but don't
                # swallow SystemExit/KeyboardInterrupt like a bare except.
                print("CLICKHOUSE INIT FAILED")

    def __new__(cls, *args, **kwargs):
        # Classic singleton: allocate the underlying object only once.
        if not hasattr(ClickhouseWrapper, "_instance"):
            ClickhouseWrapper._instance = object.__new__(cls)
        return ClickhouseWrapper._instance

    def query(self, q):
        """Execute query ``q`` and return the rows (list of tuples)."""
        return self.client.execute(q)

    def queryDataframe(self, q):
        """Execute query ``q`` and return the result as a DataFrame."""
        return self.client.query_dataframe(q)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
from py2neo import Graph | ||
from easydict import EasyDict | ||
from config import getConfig | ||
|
||
class Neo4jWrapper(object):
    """Singleton wrapper around a ``py2neo`` Graph connection."""

    def __init__(self):
        # Guard with a class flag so the Graph is not re-created on every
        # Neo4jWrapper() call (the original __init__ ran unconditionally,
        # reconnecting each time despite the singleton __new__).
        if not hasattr(Neo4jWrapper, "_first_init"):
            Neo4jWrapper._first_init = True
            neo4j_config = EasyDict(getConfig()).db.neo4j
            try:
                self.driver = Graph(neo4j_config.host)
            except Exception as e:
                # Best-effort init, mirroring the ClickHouse wrapper.
                print(e)
                print("NEO4J INIT ERROR")

    def __new__(cls, *args, **kwargs):
        # Classic singleton: allocate the underlying object only once.
        if not hasattr(Neo4jWrapper, "_instance"):
            Neo4jWrapper._instance = object.__new__(cls)
        return Neo4jWrapper._instance

    def query(self, query_sql):
        """Run a Cypher statement and return its records as dicts."""
        result = self.driver.run(query_sql)  # returns a cursor object
        return result.data()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
import os | ||
import yaml | ||
import platform | ||
from typing import List | ||
# Directory (relative to this file) holding the labeled-data YAML files.
labelInputDir = '../labeled_data'
labelInputPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), labelInputDir)

# Label types accepted in a label file's `type` field (see parseItem).
supportedTypes = set(['Region', 'Company', 'Community', 'Project', 'Foundation','Tech-0', 'Tech-1', 'Tech-2','Domain-0', 'Bot'])

# Keys accepted in a label file's `data` mapping (see parseItem).
supportedKey = set(['label', 'github_repo', 'github_org', 'github_user'])

# Shape template for the GitHub-entity lists carried by label items.
GitHubData = {
    'githubRepos': [],
    'githubOrgs': [],
    'githubUsers': [],
}

# Empty result returned by getGitHubData when there is nothing to match.
emptyData = {
    'githubRepos': [],
    'githubOrgs': [],
    'githubUsers': [],
}

# Shape template for a raw (unparsed) label item.
LabelItem = {
    'identifier': '',
    'content': {
        'name': '',
        'type': '',
        'data': '',
    },
    'parsed': True
}
# NOTE(review): update() makes LabelItem share the *same* list objects as
# GitHubData (and, below, ParsedLabelItem) — presumably these dicts are
# used only as shape documentation/annotations, never mutated; confirm.
LabelItem.update(GitHubData)

# Shape template for a fully parsed label item.
ParsedLabelItem = {
    'identifier': '',
    'type': '',
    'name': ''
}
ParsedLabelItem.update(GitHubData)
|
||
def getLabelData():
    """Load and parse every ``*.yml`` label file under ``labelInputPath``.

    Returns:
        list: parsed label items (see ``processLabelItems``), or ``[]``
        when the input path does not exist.
    """
    if not os.path.isdir(labelInputPath):
        print('{} input path is not a directory.'.format(labelInputPath))
        return []
    labelMap = {}  # identifier -> raw LabelItem dict
    indexFileName = '{}index.yml'.format(os.path.sep)
    labelFileSuffix = '.yml'

    def getfileProcessor(f):
        """Parse one label file and register it in ``labelMap``."""
        if not f.endswith('.yml'):
            return
        # `<dir>/index.yml` labels the directory itself; any other file
        # labels `<path-without-.yml>`. Identifiers always start with ':'.
        identifier = processLabelIdentifier(':{}'.format(
            f[0:f.find(indexFileName)] if f.endswith(indexFileName) else f[0:f.find(labelFileSuffix)]))
        # BUG FIX: use a context manager so the file handle is closed
        # promptly (the original `open(...).read()` leaked it to the GC).
        with open(os.path.join(labelInputPath, f), encoding='utf-8') as fp:
            content = yaml.load(fp.read(), Loader=yaml.FullLoader)
        labelMap[identifier] = {
            'identifier': identifier,
            'content': content,
            'parsed': False,
            'githubOrgs': [],
            'githubRepos': [],
            'githubUsers': [],
        }

    readPath(labelInputPath, '', getfileProcessor)
    return processLabelItems(labelMap)
|
||
def readPath(p, base, fileProcessor):
    """Recursively walk ``p``, invoking ``fileProcessor`` for each file.

    Args:
        p (str): absolute path currently being visited.
        base (str): path of ``p`` relative to the walk's root; this is
            the value handed to ``fileProcessor``.
        fileProcessor (Callable[[str], None]): callback run once per file.
    """
    if os.path.isdir(p):
        # Descend into each entry, extending both path forms in lockstep.
        for entry in os.listdir(p):
            readPath(os.path.join(p, entry), os.path.join(base, entry), fileProcessor)
    else:
        fileProcessor(base)
|
||
def processLabelItems(map_item)->List:
    """Resolve every raw label item and flatten it to the parsed shape.

    Args:
        map_item (dict): identifier -> raw LabelItem dict.

    Returns:
        list: one ParsedLabelItem-shaped dict per input item, with each
        GitHub entity list de-duplicated.
    """
    # First resolve nested `label` references for every item in place.
    for entry in map_item.values():
        parseItem(entry, map_item)
    # Then project each resolved item down to the parsed shape.
    return [
        {
            'identifier': entry.get('identifier'),
            'type': entry.get('content').get('type'),
            'name': entry.get('content').get('name'),
            'githubRepos': list(set(entry.get('githubRepos'))),
            'githubOrgs': list(set(entry.get('githubOrgs'))),
            'githubUsers': list(set(entry.get('githubUsers'))),
        }
        for entry in map_item.values()
    ]
|
||
def parseItem(item, map_item):
    """Resolve one raw label item in place (idempotent via ``parsed``).

    Extends the item's ``githubRepos``/``githubOrgs``/``githubUsers``
    lists from its own ``data`` section and, for ``label`` entries, from
    the referenced items (resolved recursively).

    Args:
        item (dict): raw LabelItem to resolve.
        map_item (dict): identifier -> raw LabelItem, used to look up
            nested label references.

    Raises:
        Exception: on an unsupported ``type``, an unsupported data key,
            or an unresolvable nested label identifier.
    """
    if item.get('parsed'):
        return
    if item.get('content').get('type') and item.get('content').get('type') not in supportedTypes:
        raise Exception('Not supported type {}'.format(item.get('content').get('type')))
    for key in item.get('content').get('data'):
        if key not in supportedKey:
            # BUG FIX: the original wrote `raise Exception('...').format(...)`,
            # calling .format on the Exception object and raising
            # AttributeError instead of this message.
            raise Exception('Not supported element={}, identifier={}'.format(key, item.get('identifier')))
        if key == 'github_repo':
            item.get('githubRepos').extend(x for x in item.get('content').get('data')[key])
        elif key == 'github_org':
            item.get('githubOrgs').extend(x for x in item.get('content').get('data')[key])
        elif key == 'github_user':
            item.get('githubUsers').extend(x for x in item.get('content').get('data')[key])
        elif key == 'label':
            labels = item.get('content').get('data')[key]
            for label in labels:
                # Relative references are joined onto this item's
                # identifier; absolute ones already start with ':'.
                identifier = label if label.startswith(':') else processLabelIdentifier(os.path.join(item.get('identifier'), label))
                innerItem = map_item.get(identifier)
                if innerItem is None:  # `is None`, not `== None`
                    raise Exception('Can not find nest identifier {} for {}'.format(identifier, item.get('identifier')))
                # Resolve the referenced item first, then absorb its lists.
                if not innerItem.get('parsed'):
                    parseItem(innerItem, map_item)
                item.get('githubOrgs').extend(x for x in innerItem.get('githubOrgs'))
                item.get('githubRepos').extend(x for x in innerItem.get('githubRepos'))
                item.get('githubUsers').extend(x for x in innerItem.get('githubUsers'))
    item['parsed'] = True
|
||
def processLabelIdentifier(identifier: str)-> str:
    """Normalize a label identifier's path separators.

    On Windows, backslash separators produced by ``os.path.join`` are
    rewritten to the alternate separator ('/'); on other platforms the
    identifier is returned untouched.
    """
    if platform.system() != 'Windows':
        return identifier
    return os.path.altsep.join(identifier.split(os.path.sep))
|
||
def labelDataToGitHubData(data)->dict:
    """Union the GitHub entities of many parsed label items.

    Args:
        data (list): ParsedLabelItem-shaped dicts.

    Returns:
        dict: GitHubData-shaped dict whose entity lists are de-duplicated.
    """
    # FIX: the original annotated the return as `GitHubData`, which is a
    # dict *instance* (evaluated at def time), not a type — use `dict`.
    repoSet = set()
    orgSet = set()
    userSet = set()
    for item in data:
        repoSet.update(item.get('githubRepos'))
        orgSet.update(item.get('githubOrgs'))
        userSet.update(item.get('githubUsers'))
    return {
        "githubRepos": list(repoSet),
        "githubOrgs": list(orgSet),
        "githubUsers": list(userSet),
    }
|
||
def getGitHubData(typeOrIds: List)-> dict:
    """Collect GitHub entities for the given label types or identifiers.

    Args:
        typeOrIds (List[str]): label ``type`` values (e.g. 'Company') or
            label identifiers (e.g. ':foo/bar'); an item matching either
            is included.

    Returns:
        dict: GitHubData-shaped dict (githubRepos/githubOrgs/githubUsers).
        NOTE(review): the empty-input paths return the shared module-level
        `emptyData` dict — callers must not mutate it.
    """
    # FIX: return annotation was the dict instance `GitHubData`; use `dict`.
    if len(typeOrIds) == 0:
        return emptyData
    data = getLabelData()
    if data is None:  # `is None`, not `== None` (original idiom bug)
        return emptyData
    matched = [i for i in data if i.get('type') in typeOrIds or i.get('identifier') in typeOrIds]
    return labelDataToGitHubData(matched)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
class Metric(object):
    """Namespace grouping the project's metric implementations.

    The class-level imports bind each metric class as a lowercase
    attribute, so callers can write e.g. ``Metric.chaoss``,
    ``Metric.index`` and ``Metric.relation``.
    """
    from metrics.chaoss import Chaoss as chaoss
    from metrics.index import Index as index
    from metrics.related_users import Relation as relation
Oops, something went wrong.