使用随机森林、xgboost、LSTM模型,使用CHAOSS统计型数据指标来拟合OpenRank值
可用的数据格式分为两种。第一种是一个字典,字典的 key 为时间(年-月),value 为数值,例如:
{"2020-08":4,"2020-09":4,"2020-10":1,"2020-11":2,"2020-12":10,"2021-01":4,"2021-02":3,"2021-03":2,"2021-04":2,"2021-06":3,"2021-07":1,"2021-08":1,"2021-09":1,"2021-11":4,"2022-03":1,"2022-04":1,"2022-05":2,"2022-06":3,"2022-07":3,"2022-08":2,"2022-09":3,"2022-10":9,"2022-11":3,"2022-12":2,"2023-01":2,"2023-02":4,"2023-03":4,"2023-04":5,"2023-05":3,"2023-06":2}
这种格式处理起来比较简单,代码如下:
def process_1(directory, project, file):
    """Load a flat ``{month: value}`` JSON metric file as a one-column DataFrame.

    Args:
        directory: Path prefix. It is string-concatenated with ``project``
            (not path-joined), so it normally needs a trailing separator.
        project: Project sub-path appended directly to ``directory``.
        file: JSON file name. The resulting column is named ``file[:-5]``,
            i.e. the file name without a 5-character suffix such as ``.json``.

    Returns:
        A DataFrame indexed by the JSON keys with a single column, or an
        empty DataFrame when the file does not exist or holds no entries.
    """
    path = os.path.join(directory + project, file)
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        # A missing metric file simply contributes no data.
        return pd.DataFrame()
    if not data:
        return pd.DataFrame()
    temp_df = pd.DataFrame.from_dict(data, orient='index')
    temp_df.columns = [file[:-5]]
    return temp_df
另一种格式是嵌套字典,外层的 key 为属性名,内层仍以时间为 key,例如:
{"avg":{"2020-10":7.75,"2020-11":24.4,"2020-12":14.91,"2021-01":19,"2021-02":23.77,"2021-03":62.88,"2021-04":143.75,"2021-05":144.8,"2021-06":117.43,"2021-07":138.29,"2021-08":114.09,"2021-09":109,"2021-10":140,"2021-11":355.75,"2021-12":262.67,"2022-01":335.4,"2022-02":303.83,"2022-03":290.29,"2022-04":281,"2022-05":307.5,"2022-06":248.64,"2022-07":240.69,"2022-08":223.19,"2022-09":265.53,"2022-10":295.8,"2022-11":305.56,"2022-12":315.71,"2023-01":364.56,"2023-02":333.37,"2023-03":330,"2023-04":358.52,"2023-05":388.05,"2023-06":418.05},"levels":{"2020-10":[3,1,0,0],"2020-11":[2,0,3,0],"2020-12":[19,0,0,3],"2021-01":[21,0,1,3],"2021-02":[34,38,17,4],"2021-03":[5,0,0,3],"2021-04":[1,0,0,3],"2021-05":[0,1,1,3],"2021-06":[3,0,1,3],"2021-07":[2,1,0,4],"2021-08":[2,2,3,4],"2021-09":[2,2,4,7],"2021-10":[0,0,4,11],"2021-11":[0,0,0,4],"2021-12":[1,1,0,4],"2022-01":[1,0,0,4],"2022-02":[1,0,1,4],"2022-03":[0,1,1,5],"2022-04":[1,0,1,6],"2022-05":[1,0,0,7],"2022-06":[2,1,1,7],"2022-07":[0,2,3,8],"2022-08":[2,1,2,11],"2022-09":[1,0,1,13],"2022-10":[0,1,0,14],"2022-11":[1,0,1,14],"2022-12":[2,0,0,15],"2023-01":[0,1,0,15],"2023-02":[1,2,1,15],"2023-03":[2,0,3,16],"2023-04":[1,0,1,19],"2023-05":[1,0,0,20],"2023-06":[0,0,1,20]},"quantile_0":{"2020-10":1,"2020-11":1,"2020-12":1,"2021-01":1,"2021-02":4,"2021-03":2,"2021-04":2,"2021-05":25,"2021-06":1,"2021-07":4,"2021-08":7,"2021-09":2,"2021-10":33,"2021-11":208,"2021-12":4,"2022-01":6,"2022-02":6,"2022-03":23,"2022-04":6,"2022-05":1,"2022-06":5,"2022-07":25,"2022-08":2,"2022-09":2,"2022-10":22,"2022-11":2,"2022-12":3,"2023-01":16,"2023-02":7,"2023-03":3,"2023-04":3,"2023-05":3,"2023-06":33},"quantile_1":{"2020-10":2.5,"2020-11":1,"2020-12":4.25,"2021-01":4,"2021-02":12,"2021-03":3.75,"2021-04":138.5,"2021-05":33,"2021-06":5,"2021-07":14.5,"2021-08":24.5,"2021-09":30,"2021-10":61,"2021-11":350.5,"2021-12":78.5,"2022-01":270,"2022-02":100,"2022-03":51,"2022-04":63.5,"2022-05":94.5,"2022-06":25.5,"2022-07":41,"2022-08":58.25,"2022-09":9
3,"2022-10":124,"2022-11":149.25,"2022-12":178,"2023-01":211.25,"2023-02":176,"2023-03":75,"2023-04":105,"2023-05":136,"2023-06":166},"quantile_2":{"2020-10":6.5,"2020-11":33,"2020-12":6.5,"2021-01":7,"2021-02":20,"2021-03":5.5,"2021-04":187.5,"2021-05":215,"2021-06":55,"2021-07":86,"2021-08":42,"2021-09":58,"2021-10":89,"2021-11":401.5,"2021-12":334,"2022-01":460,"2022-02":393,"2022-03":329,"2022-04":227,"2022-05":258,"2022-06":128,"2022-07":145,"2022-08":87.5,"2022-09":123,"2022-10":154,"2022-11":178.5,"2022-12":204,"2023-01":240.5,"2023-02":253,"2023-03":279,"2023-04":309,"2023-05":340,"2023-06":370},"quantile_3":{"2020-10":11.75,"2020-11":40,"2020-12":10.5,"2021-01":11,"2021-02":27,"2021-03":155.75,"2021-04":192.75,"2021-05":222,"2021-06":248.5,"2021-07":279.5,"2021-08":212,"2021-09":113,"2021-10":144,"2021-11":406.75,"2021-12":434.25,"2022-01":467,"2022-02":493.25,"2022-03":522.5,"2022-04":550.75,"2022-05":581.75,"2022-06":515,"2022-07":451,"2022-08":284,"2022-09":380,"2022-10":411,"2022-11":375,"2022-12":340,"2023-01":437,"2023-02":385,"2023-03":402,"2023-04":432,"2023-05":463,"2023-06":493},"quantile_4":{"2020-10":17,"2020-11":47,"2020-12":78,"2021-01":109,"2021-02":137,"2021-03":168,"2021-04":198,"2021-05":229,"2021-06":259,"2021-07":290,"2021-08":321,"2021-09":351,"2021-10":382,"2021-11":412,"2021-12":443,"2022-01":474,"2022-02":502,"2022-03":533,"2022-04":563,"2022-05":594,"2022-06":624,"2022-07":655,"2022-08":686,"2022-09":716,"2022-10":747,"2022-11":777,"2022-12":808,"2023-01":839,"2023-02":867,"2023-03":898,"2023-04":928,"2023-05":959,"2023-06":989}}
字典的 key 包括 "avg"、"levels"、"quantile_0"、"quantile_1"、"quantile_2"、"quantile_3"、"quantile_4" 这些属性。注意到 "levels" 的 value 是一个固定含 4 个元素的 list,所以我们将它拆分为 "levels_0"、"levels_1"、"levels_2"、"levels_3" 四列。代码如下:
def process_2(directory, project, file):
    """Load a nested ``{attribute: {month: value}}`` JSON metric file.

    Each top-level attribute becomes one column named
    ``<file[:-5]>_<attribute>``. The ``levels`` attribute, whose per-month
    value is a list, is expanded into ``<file[:-5]>_levels_0``,
    ``..._levels_1``, ... with one column per list position.

    Args:
        directory: Path prefix. It is string-concatenated with ``project``
            (not path-joined), so it normally needs a trailing separator.
        project: Project sub-path appended directly to ``directory``.
        file: JSON file name. ``file[:-5]`` (the name without a 5-character
            suffix such as ``.json``) is used as the column-name prefix.

    Returns:
        A DataFrame indexed by month holding the outer join of all
        attribute columns, or an empty DataFrame when the file does not
        exist or every attribute is empty.
    """
    path = os.path.join(directory + project, file)
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        # A missing metric file simply contributes no data.
        return pd.DataFrame()

    prefix = str(file[:-5])
    df = pd.DataFrame()
    for key, values in data.items():
        if not values:
            continue
        temp_df = pd.DataFrame.from_dict(values, orient='index')
        if key == "levels":
            # One column per list position; derived from the actual width
            # instead of assuming exactly four entries.
            temp_df.columns = [
                f"{prefix}_{key}_{i}" for i in range(temp_df.shape[1])
            ]
        else:
            temp_df.columns = [f"{prefix}_{key}"]
        # Outer join keeps months that appear in only some attributes.
        df = temp_df if df.empty else df.join(temp_df, how='outer')
    return df
我们在拟合 OpenRank 值时,使用 2023 年 1、2、3 月的数据作为测试集,以预测值与真实值之间的均方误差(MSE)作为评价指标。
测试集 MSE:9800。特征重要性排行(这里只列出排名靠前的特征):
issues_and_change_request_active: 0.72565
bus_factor: 0.10606
technical_fork: 0.07172
change_request_resolution_duration_levels_3: 0.03049
change_requests_reviews: 0.01791
inactive_contributors: 0.00627
issue_age_quantile_4: 0.00371
issue_age_levels_2: 0.00344
change_requests: 0.00332
change_request_age_quantile_1: 0.00299
issue_age_levels_3: 0.00280
issue_resolution_duration_levels_1: 0.00274
issue_resolution_duration_levels_0: 0.00262
change_request_age_levels_3: 0.00237
issues_closed: 0.00188
change_request_resolution_duration_avg: 0.00181
issue_response_time_levels_0: 0.00129
change_request_resolution_duration_levels_2: 0.00112
测试集mse:4723
测试集mse:1100