Skip to content

Commit

Permalink
add statistical analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
DonaldLamNL committed Jan 1, 2024
1 parent 75b0b7a commit e4e015e
Show file tree
Hide file tree
Showing 51 changed files with 459 additions and 946 deletions.
2 changes: 2 additions & 0 deletions Analysis/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
## To-Do
Write the README.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
99 changes: 55 additions & 44 deletions visualization/main.ipynb → Analysis/main.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
"source": [
"import json\n",
"import pandas as pd\n",
"from sklearn.cluster import DBSCAN\n",
"\n",
"from visualize import *\n",
"from utils import *\n",
"\n",
"my_dict = {'t1': 'T1','t2': 'T2','t3': 'T3','t4': 'T4','t5': 'T5',\n",
" 'v1': 'V1','v2': 'V2','v3': 'V3','v4': 'V4','v5': 'V5',\n",
Expand Down Expand Up @@ -44,7 +44,8 @@
"outputs": [],
"source": [
"basis = extract_basis('basis/full.json')\n",
"vis = Visualize('BFI', basis)\n"
"vis = Visualize('BFI', basis)\n",
"data, info = extract_data('save/save.json')\n"
]
},
{
Expand All @@ -54,8 +55,6 @@
"outputs": [],
"source": [
"# Plot Prompt Sensitivity Results\n",
"data, info = extract_data('save/save.json')\n",
"\n",
"for aspect in info:\n",
" for index, value in enumerate(data[aspect].unique()):\n",
" vis.add(data[data[aspect] == value], my_colors[index], my_dict[value])\n",
Expand All @@ -66,49 +65,16 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of Inliers: 2439\n",
"Number of Outliers: 61\n",
"2500\n",
"Saved \"figures/outliers-0\".\n"
]
},
{
"data": {
"text/plain": [
"<Figure size 640x480 with 0 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"# Plot Outliers\n",
"def detect_outlier(df, eps, min_samples):\n",
" pca_data = vis.pca_transform(df)\n",
" dbscan = DBSCAN(eps=eps, min_samples=min_samples)\n",
" labels = dbscan.fit_predict(pca_data)\n",
" print(f\"Number of Inliers: {len(labels[labels != -1])}\")\n",
" print(f\"Number of Outliers: {len(labels[labels == -1])}\")\n",
" print(f\"{len(labels[labels != -1]) + len(labels[labels == -1])}\")\n",
" df['Label'] = labels\n",
" return df\n",
"\n",
"test_cases = [(0.302, 20)]\n",
"for i, (eps, min_samples) in enumerate(test_cases):\n",
" data_outliers = detect_outlier(data, eps, min_samples)\n",
" vis.add(data_outliers[data[\"Label\"] == -1], my_colors[0], 'Outliers')\n",
" vis.add(data_outliers[data[\"Label\"] != -1], my_colors[1], 'Inliers')\n",
" vis.plot(f'outliers-{i}')\n",
" vis.clean()\n",
" "
"data_outliers = vis.detect_outlier(data, 0.302, 20)\n",
"vis.add(data_outliers[data[\"Label\"] == -1], my_colors[0], 'Outliers')\n",
"vis.add(data_outliers[data[\"Label\"] != -1], my_colors[1], 'Inliers')\n",
"vis.plot(f'outliers')\n",
"vis.clean()\n"
]
},
{
Expand Down Expand Up @@ -206,6 +172,51 @@
" vis.clean()\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Statistical Analysis\n",
"# For every prompt aspect, compare each value of that aspect against all the\n",
"# other values on every questionnaire category, and write one table per aspect.\n",
"filename = 'comp.md'\n",
"questionnaire = get_questionnaire('BFI')\n",
"categories = list(questionnaire['categories'].keys())\n",
"\n",
"# data, info = extract_data('save/save.json')\n",
"\n",
"analysis_data = data\n",
"\n",
"# Remove outliers (comment out the next two lines to analyse the full dataset).\n",
"# detect_outlier() adds the 'Label' column and returns the frame; -1 marks an outlier.\n",
"labelled = vis.detect_outlier(data, 0.302, 20)\n",
"analysis_data = labelled[labelled[\"Label\"] != -1]\n",
"print(analysis_data.shape)\n",
"\n",
"with open(filename, 'w') as output_file:\n",
"    for aspect in info:\n",
"        write_df = pd.DataFrame(columns=categories)\n",
"\n",
"        # Use the filtered frame so that removing outliers actually affects the tests.\n",
"        for value in analysis_data[aspect].unique():\n",
"            records = list()\n",
"            x = analysis_data[analysis_data[aspect] == value]\n",
"            y = analysis_data[analysis_data[aspect] != value]\n",
"\n",
"            for cat in categories:\n",
"                sym, diff, _, p_val = hypothesis_testing(x[cat].tolist(), y[cat].tolist())\n",
"\n",
"                # ==================== Output Template ====================\n",
"                records.append(f'${sym} {diff:.2f}_{{{p_val:.2f}}}$')\n",
"                # =========================================================\n",
"\n",
"            write_df.loc[my_dict[value]] = records\n",
"\n",
"        output_file.write(f'### {aspect.capitalize()}\\n')\n",
"        # ==================== Output Format ====================\n",
"        output_file.write(write_df.to_markdown())\n",
"        # output_file.write(write_df.to_latex(escape=False))\n",
"        # =======================================================\n",
"        output_file.write('\\n\\n')\n"
]
}
],
"metadata": {
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
95 changes: 95 additions & 0 deletions Analysis/tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
"""
Author: LAM Man Ho ([email protected])
"""

import json
import numpy as np
import pandas as pd
import scipy.stats as stats

from itertools import product

'''
Get corresponding questionnaire
name (str)
'''
def get_questionnaire(name):
    """Return the questionnaire entry stored under ``name``.

    The entry is read from ``dataset/questionnaires.json``, resolved relative
    to the current working directory.

    Args:
        name (str): Key of the questionnaire in the JSON file (e.g. ``'BFI'``).

    Returns:
        The value stored under ``name`` in the JSON file.

    Raises:
        FileNotFoundError: If ``dataset/questionnaires.json`` does not exist.
        ValueError: If the file has no entry called ``name``.
    """
    try:
        with open('dataset/questionnaires.json') as dataset:
            data = json.load(dataset)
    except FileNotFoundError as err:
        raise FileNotFoundError("The 'questionnaires.json' file does not exist.") from err

    try:
        return data[name]
    except KeyError as err:
        # A missing key raises KeyError (not ValueError), so this is the
        # exception that has to be translated into the "not found" error.
        raise ValueError("Questionnaire not found.") from err

'''
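Construct the full basis (every combination of the questionnaire's scale values across its categories) and save it as JSON, so that it can later be fitted with PCA to extract the projection matrix for dimensional reduction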
'''
def construct_basis(questionnaire_name, savefile, mode='full'):
    """Enumerate every scale combination over the categories and save it as JSON.

    Each basis entry is a dict mapping every category name of the questionnaire
    to one scale value; the entries cover the full Cartesian product of the
    scale values.

    Args:
        questionnaire_name (str): Name passed to ``get_questionnaire``.
        savefile (str): Path of the JSON file to write (overwritten).
        mode (str): Accepted but not used by this function.
    """
    spec = get_questionnaire(questionnaire_name)
    scale_values = spec["scales"]
    category_names = list(spec["categories"].keys())

    # One tuple per point of the grid: a scale value for each category, in order.
    grid = product(scale_values, repeat=len(category_names))
    basis = [dict(zip(category_names, point)) for point in grid]

    with open(savefile, 'w') as f:
        json.dump(basis, f, indent=4)

'''
Extract the fitting basis
'''
def extract_basis(filename):
    """Load the basis stored in the JSON file ``filename`` as a DataFrame.

    Args:
        filename (str): Path of a JSON file, such as one written by
            ``construct_basis``.

    Returns:
        pandas.DataFrame: A frame built directly from the parsed JSON content.
    """
    with open(filename, 'r') as f:
        return pd.DataFrame(json.load(f))

'''
Extract the save data as DataFrame
'''
def extract_data(filename):
    """Load saved results from a JSON file as a DataFrame.

    The file must hold an object whose ``"data"`` key is a list of records,
    each with an ``"info"`` dict and a ``"data"`` dict. The two dicts of every
    record are merged into one row (``"data"`` wins on a key clash).

    Args:
        filename (str): Path of the JSON save file.

    Returns:
        tuple: ``(df, info)`` where ``df`` is the DataFrame of merged rows and
        ``info`` is a tuple with the ``"info"`` keys of the first record
        (empty when the file contains no records).

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
    """
    try:
        with open(filename, 'r') as f:
            saved = json.load(f)
    except FileNotFoundError as err:
        # The file is missing, so FileExistsError would report the opposite problem.
        raise FileNotFoundError(f"The save file '{filename}' does not exist.") from err

    records = saved["data"]
    # Guard the empty case: there is no first record to take the keys from.
    info = tuple(records[0]["info"].keys()) if records else ()
    rows = [{**record["info"], **record["data"]} for record in records]
    return pd.DataFrame(rows), info


'''
Conduct hypothesis testing
x, y (list)
'''
def hypothesis_testing(x, y, significant_level=0.001):
    """Test whether two samples have different means.

    A two-sided F-test on the sample variances first decides whether the two
    groups may be treated as having equal variance; the t-test is then run
    with ``equal_var`` set accordingly.

    Args:
        x, y (list): The two samples of numeric observations.
        significant_level (float): Threshold applied to both the F-test and
            the t-test p-values.

    Returns:
        tuple: ``(symbol, diff, t_value, p_value)`` where ``symbol`` is ``'='``
        when the t-test p-value exceeds ``significant_level`` and otherwise
        ``'>'`` or ``'<'`` following the sign of ``t_value``; ``diff`` is
        ``mean(x) - mean(y)``.

    Raises:
        ValueError: If either sample has fewer than two observations, for
            which the sample standard deviation is undefined.
    """
    n1, n2 = len(x), len(y)
    if n1 < 2 or n2 < 2:
        raise ValueError("Each sample needs at least two observations.")

    mean1, mean2 = np.mean(x), np.mean(y)

    # Both the F-test and ttest_ind_from_stats work on the corrected sample
    # standard deviation (ddof=1); np.std defaults to the population one.
    # Add an epsilon to prevent a zero standard deviation.
    epsilon = 1e-8
    std1 = np.std(x, ddof=1) + epsilon
    std2 = np.std(y, ddof=1) + epsilon

    # Perform the F-test with the larger variance in the numerator.
    if std1 > std2:
        f_value = std1 ** 2 / std2 ** 2
        df1, df2 = n1 - 1, n2 - 1
    else:
        f_value = std2 ** 2 / std1 ** 2
        df1, df2 = n2 - 1, n1 - 1

    # sf() is 1 - cdf() without the cancellation error; doubling the tail can
    # exceed 1 when the degrees of freedom differ, so clamp the p-value.
    f_p_value = min(1.0, 2 * stats.f.sf(f_value, df1, df2))
    equal_var = bool(f_p_value > significant_level)

    # Perform the t-test (pooled variance if equal_var, Welch's otherwise).
    t_value, p_value = stats.ttest_ind_from_stats(
        mean1, std1, n1, mean2, std2, n2, equal_var=equal_var
    )

    diff = mean1 - mean2
    if p_value > significant_level:
        symbol = '='
    elif t_value > 0:
        symbol = '>'
    else:
        symbol = '<'

    return symbol, diff, t_value, p_value
66 changes: 13 additions & 53 deletions visualization/visualize.py → Analysis/visualize.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
"""
Author: LAM Man Ho ([email protected])
"""

import os
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import product

from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from Analysis.tools import *

class Visualize:
'''
Expand Down Expand Up @@ -86,54 +86,14 @@ def plot(self, savename=None, random_zorder=False, exclude=[]):
'''
def clean(self):
self.data = list()

'''
Get corresponding questionnaire
name (str)
'''
def get_questionnaire(name):
    """Return the questionnaire entry stored under ``name``.

    The entry is read from ``dataset/questionnaires.json``, resolved relative
    to the current working directory.

    Args:
        name (str): Key of the questionnaire in the JSON file (e.g. ``'BFI'``).

    Returns:
        The value stored under ``name`` in the JSON file.

    Raises:
        FileNotFoundError: If ``dataset/questionnaires.json`` does not exist.
        ValueError: If the file has no entry called ``name``.
    """
    try:
        with open('dataset/questionnaires.json') as dataset:
            data = json.load(dataset)
    except FileNotFoundError as err:
        raise FileNotFoundError("The 'questionnaires.json' file does not exist.") from err

    try:
        return data[name]
    except KeyError as err:
        # A missing key raises KeyError (not ValueError), so this is the
        # exception that has to be translated into the "not found" error.
        raise ValueError("Questionnaire not found.") from err

'''
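Construct the full basis (every combination of the questionnaire's scale values across its categories) and save it as JSON, so that it can later be fitted with PCA to extract the projection matrix for dimensional reduction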
'''
def construct_basis(questionnaire_name, savefile, mode='full'):
    """Enumerate every scale combination over the categories and save it as JSON.

    Each basis entry is a dict mapping every category name of the questionnaire
    to one scale value; the entries cover the full Cartesian product of the
    scale values.

    Args:
        questionnaire_name (str): Name passed to ``get_questionnaire``.
        savefile (str): Path of the JSON file to write (overwritten).
        mode (str): Accepted but not used by this function.
    """
    spec = get_questionnaire(questionnaire_name)
    scale_values = spec["scales"]
    category_names = list(spec["categories"].keys())

    # One tuple per point of the grid: a scale value for each category, in order.
    grid = product(scale_values, repeat=len(category_names))
    basis = [dict(zip(category_names, point)) for point in grid]

    with open(savefile, 'w') as f:
        json.dump(basis, f, indent=4)

'''
Extract the fitting basis
'''
def extract_basis(filename):
    """Load the basis stored in the JSON file ``filename`` as a DataFrame.

    Args:
        filename (str): Path of a JSON file, such as one written by
            ``construct_basis``.

    Returns:
        pandas.DataFrame: A frame built directly from the parsed JSON content.
    """
    with open(filename, 'r') as f:
        return pd.DataFrame(json.load(f))

'''
Extract the save data as DataFrame
'''
def extract_data(filename):
    """Load saved results from a JSON file as a DataFrame.

    The file must hold an object whose ``"data"`` key is a list of records,
    each with an ``"info"`` dict and a ``"data"`` dict. The two dicts of every
    record are merged into one row (``"data"`` wins on a key clash).

    Args:
        filename (str): Path of the JSON save file.

    Returns:
        tuple: ``(df, info)`` where ``df`` is the DataFrame of merged rows and
        ``info`` is a tuple with the ``"info"`` keys of the first record
        (empty when the file contains no records).

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
    """
    try:
        with open(filename, 'r') as f:
            saved = json.load(f)
    except FileNotFoundError as err:
        # The file is missing, so FileExistsError would report the opposite problem.
        raise FileNotFoundError(f"The save file '{filename}' does not exist.") from err

    records = saved["data"]
    # Guard the empty case: there is no first record to take the keys from.
    info = tuple(records[0]["info"].keys()) if records else ()
    rows = [{**record["info"], **record["data"]} for record in records]
    return pd.DataFrame(rows), info
def detect_outlier(self, df, eps, min_samples):
    """Label every row of ``df`` with its DBSCAN cluster in PCA space.

    The frame is projected with ``self.pca_transform`` and clustered with
    DBSCAN; rows labelled ``-1`` are reported as outliers. The inlier count,
    outlier count and their total are printed.

    Args:
        df (pandas.DataFrame): Data to label. It is modified in place: a
            ``'Label'`` column holding the cluster labels is added.
        eps: Passed to ``DBSCAN`` as ``eps``.
        min_samples: Passed to ``DBSCAN`` as ``min_samples``.

    Returns:
        pandas.DataFrame: The same ``df`` object, now with the ``'Label'`` column.
    """
    projected = self.pca_transform(df)
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(projected)

    outlier_count = int((labels == -1).sum())
    inlier_count = int((labels != -1).sum())
    print(f"Number of Inliers: {inlier_count}")
    print(f"Number of Outliers: {outlier_count}")
    print(f"{inlier_count + outlier_count}")

    # Callers filter on the original frame's 'Label' column, so keep writing in place.
    df['Label'] = labels
    return df
Loading

0 comments on commit e4e015e

Please sign in to comment.