diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
new file mode 100644
index 0000000..f25caf6
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,103 @@
+name: Task1 QC
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  process_raw:
+    runs-on: self-hosted
+    outputs:
+      sub: ${{ steps.set_vars.outputs.sub }}
+      task: ${{ steps.set_vars.outputs.task }}
+      version: ${{ steps.set_vars.outputs.version }}
+
+    steps:
+      - name: checkout code and return recently uploaded file in /data
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0  # full history, so `git log --since` can see recent commits
+      - name: Get changed files
+        run: |
+          # Get the list of CSV files changed in the last 24 hours
+          data=$(git log --since="24 hours ago" --name-only --pretty=format: -- '*.csv' | sort | uniq)
+
+          # Export the (possibly multi-line) value to the environment using the
+          # heredoc form; a plain `echo "data=$data"` breaks on multi-line values
+          {
+            echo "data<<EOF"
+            echo "$data"
+            echo "EOF"
+          } >> "$GITHUB_ENV"
+
+          # Print the changed CSV files
+          echo "Changed CSV files in the last 24 hours: $data"
+
+      - name: set up python
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: parse raw
+        id: set_vars
+        run: |
+          # Loop through each CSV file in $data; each iteration overwrites the
+          # step outputs, so only the last file's values reach downstream jobs
+          for file in $data; do
+            filename=$(basename "$file")
+            IFS='_' read -r sub task version <<< "$filename"
+            version="${version%.csv}"  # Remove the .csv extension from version
+            # ::set-output is deprecated; write to $GITHUB_OUTPUT instead
+            echo "sub=$sub" >> "$GITHUB_OUTPUT"
+            echo "task=$task" >> "$GITHUB_OUTPUT"
+            echo "version=$version" >> "$GITHUB_OUTPUT"
+            echo "Subject: $sub"
+            echo "Task: $task"
+            echo "Version: $version"
+          done
+
+  run_qc:
+    runs-on: self-hosted
+    needs: process_raw
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Debug env vars
+        run: |
+          echo "sub=${{ needs.process_raw.outputs.sub }}"
+          echo "task=${{ needs.process_raw.outputs.task }}"
+          echo "version=${{ needs.process_raw.outputs.version }}"
+
+      - name: run quality control
+        run: |
+          sub="${{ needs.process_raw.outputs.sub }}"
+          task="${{ needs.process_raw.outputs.task }}"
+          vers="${{ needs.process_raw.outputs.version }}"
+          for sub in ${sub}; do
+            echo "Processing subject: $sub"
+            for task in ${task}; do
+              echo "Processing task: $task"
+              for vers in ${vers}; do
+                echo "Processing version: $vers"
+                csv_file="./data/${sub}/processed/${sub}_${task}_${vers}.csv"
+                log_file="./data/${sub}/qc_${task}_${vers}.log"
+                echo "CSV file: $csv_file"
+                echo "Log file: $log_file"
+                if [ -f "$csv_file" ]; then
+                  python ./code/SMqC.py -s "$csv_file" -o "./data/${sub}/" -sub "$sub" | tee "$log_file"
+                  echo "QC for ${sub}_${task}_${vers} complete"
+                else
+                  echo "CSV file $csv_file does not exist"
+                fi
+              done
+            done
+          done
+
+  push:
+    runs-on: self-hosted
+    needs: run_qc
+    steps:
+      # Relies on the workspace left behind by run_qc on the same self-hosted runner
+      - name: Commit and Push Changes
+        run: |
+          git config --global user.name "miloswrath"
+          git config --global user.email "miloswrath@users.noreply.github.com"
+          # Authenticate the remote with the PAT supplied below as GITHUB_TOKEN
+          git remote set-url origin https://x-access-token:${GITHUB_TOKEN}@github.com/$GITHUB_REPOSITORY
+          git add .
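The `parse raw` step splits processed filenames on underscores, so everything downstream depends on the `{sub}_{task}_{version}.csv` naming convention. A minimal Python sketch of that contract (the helper name and sample values are illustrative, not from the repository):

import os

def parse_processed_name(path):
    # {sub}_{task}_{version}.csv, mirroring `IFS='_' read -r sub task version`;
    # maxsplit=2 keeps any extra underscores inside the version, as `read` would
    stem, _ = os.path.splitext(os.path.basename(path))
    sub, task, version = stem.split("_", 2)
    return sub, task, version

print(parse_processed_name("data/sub01/processed/sub01_task1_v2.csv"))
# -> ('sub01', 'task1', 'v2')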
+ git commit -m "Automated commit by GitHub Actions" + git push + env: + GITHUB_TOKEN: ${{ secrets.GIT_PAT }} diff --git a/code/ConvertBeh.py b/code/ConvertBeh.py new file mode 100644 index 0000000..ff83c6a --- /dev/null +++ b/code/ConvertBeh.py @@ -0,0 +1,71 @@ +# %% +import pandas as pd +import numpy as np +import os +import json + +def parse_args(): + import argparse + parser = argparse.ArgumentParser(description='Convert Behavior Data') + parser.add_argument('-submission', type=str, help='Path to the submission file') + parser.add_argument('-out', type=str, help='Output directory') + parser.add_argument('-sub', type=str, help='Subject ID') + parser.add_argument('-task', type=str, help='Task name') + parser.add_argument('-taskvers', type=str, help='Task version') + return parser.parse_args() + + +def rename_files(submission, subject, task, taskvers): + #rename raw text files to be subject_task_taskvers.txt + for root, dirs, files in os.walk(submission): + for file in files: + if file.endswith(".txt"): + os.rename(os.path.join(root, file), os.path.join(root, f"{subject}_{task}_{taskvers}.txt")) + submission = os.path.join(root, 'raw', f"{subject}_{task}_{taskvers}.txt") + print(submission) + return submission + + + + +def convert_beh(submission, out): + + if not os.path.isfile(submission): + print(f"file does not exist: {submission}") + + # Use list_txt to store one file since I don't want to screw with Marco's code + + + count = 0 + dic = {} + + count += 1 + tweets = [] + with open(submission, 'r') as file: + for line in file: + tweets.append(json.loads(line)) + dic[count]= pd.json_normalize(tweets,'data') + + print(dic) + + + paths = [] + for i in range(len(dic)): + i += 1 + for sub in np.unique(dic[i]['subject_id']): + print(sub) + paths.append((out+"/{0}_{1}_{2}"+".csv").format(sub,dic[i]['task'][0],dic[i]['task_vers'][0])) + + for path in paths: + dic[i].to_csv(path, index=False) + print(f"saved {path}") + +def main(): + args = parse_args() + submissive = rename_files(args.submission, args.sub, args.task, args.taskvers) + convert_beh(submissive, args.out) + +if __name__ == "__main__": + main() + + diff --git a/jatosAPI.py b/jatosAPI.py new file mode 100644 index 0000000..e5b2c7f --- /dev/null +++ b/jatosAPI.py @@ -0,0 +1,225 @@ +import requests +from datetime import datetime, timedelta +import zipfile +import os +import numpy as np +import pandas as pd +import json +import shutil +import subprocess + +# jap_5ThOJ14yf7z1EPEUpAoZYMWoETZcmJk305719 + +def get_met(): + + + url = 'https://jatos.psychology.uiowa.edu/jatos/api/v1/results/metadata' + headers = { + 'accept': 'application/json', + 'Authorization': 'Bearer jap_5ThOJ14yf7z1EPEUpAoZYMWoETZcmJk305719', + 'Content-Type': 'application/json', + } + data = { + 'studyIds': [955, 971, 994, 917, 927, 943] + } + + response = requests.post(url, headers=headers, json=data) + + # If you want to print the response + print(response.status_code) + print(response.json()) + response_json = response.json() + + response = response_json + + # Get the current timestamp + current_time = datetime.now().timestamp() * 1000 # Convert to milliseconds + one_day_ago = current_time - (24 * 60 * 60 * 1000) # 24 hours ago in milliseconds + + # Initialize an empty list to store study result IDs + study_result_ids = [] + + # Iterate through the data to check conditions and collect study result IDs + for study in response['data']: + for study_result in study['studyResults']: + if study_result['studyState'] == 'FINISHED' and study_result['endDate'] >= one_day_ago: 
diff --git a/jatosAPI.py b/jatosAPI.py
new file mode 100644
index 0000000..e5b2c7f
--- /dev/null
+++ b/jatosAPI.py
@@ -0,0 +1,225 @@
+import json
+import os
+import shutil
+import subprocess
+import zipfile
+from datetime import datetime
+
+import numpy as np
+import pandas as pd
+import requests
+
+# The bearer token is read from the environment rather than hard-coded in
+# version control; JATOS_API_TOKEN is the variable name assumed here
+JATOS_TOKEN = os.environ.get('JATOS_API_TOKEN', '')
+
+STUDY_IDS = [955, 971, 994, 917, 927, 943]
+
+
+def get_met():
+    url = 'https://jatos.psychology.uiowa.edu/jatos/api/v1/results/metadata'
+    headers = {
+        'accept': 'application/json',
+        'Authorization': f'Bearer {JATOS_TOKEN}',
+        'Content-Type': 'application/json',
+    }
+    data = {
+        'studyIds': STUDY_IDS
+    }
+
+    response = requests.post(url, headers=headers, json=data)
+    print(response.status_code)
+    print(response.json())
+    response = response.json()
+
+    # Keep only results that finished within the last 24 hours; JATOS reports
+    # endDate in milliseconds since the epoch, so the cutoff is computed the same way
+    current_time = datetime.now().timestamp() * 1000
+    one_day_ago = current_time - (24 * 60 * 60 * 1000)
+
+    # Collect the ID of every finished study result inside the window
+    study_result_ids = []
+    for study in response['data']:
+        for study_result in study['studyResults']:
+            if study_result['studyState'] == 'FINISHED' and study_result['endDate'] >= one_day_ago:
+                study_result_ids.append(study_result['id'])
+
+    print(study_result_ids)
+
+    if len(study_result_ids) == 0:
+        print("No study results found.")
+        raise SystemExit
+
+    return study_result_ids
+
+
+def get_data(study_result_ids):
+    headers = {
+        'accept': 'application/octet-stream',
+        'Authorization': f'Bearer {JATOS_TOKEN}',
+        'Content-Type': 'application/json',
+    }
+    # Request the data for each study result
+    datas = {
+        'studyIds': STUDY_IDS,
+        'studyResultIds': study_result_ids
+    }
+
+    url = 'https://jatos.psychology.uiowa.edu/jatos/api/v1/results/data'
+    response = requests.post(url, headers=headers, json=datas)
+    print(f"Status Code: {response.status_code}")
+
+    txt_files = []  # initialized up front so the failure paths can still return it
+
+    # Save the archive, filter it, and extract the .txt files
+    if response.status_code == 200:
+        jrzip_file = 'response.jrzip'
+        with open(jrzip_file, 'wb') as f:
+            f.write(response.content)
+        print(f"Downloaded file: {jrzip_file}")
+
+        # Verify that the download is a valid zip archive
+        if zipfile.is_zipfile(jrzip_file):
+            print("The file is a valid zip file.")
+
+            # Create a new zip file holding only the requested results
+            filtered_jrzip_file = 'filtered_response.jrzip'
+            with zipfile.ZipFile(jrzip_file, 'r') as zip_ref:
+                with zipfile.ZipFile(filtered_jrzip_file, 'w') as filtered_zip_ref:
+                    for zip_info in zip_ref.infolist():
+                        # Keep entries whose path mentions one of the study result IDs
+                        if any(str(study_result_id) in zip_info.filename for study_result_id in study_result_ids):
+                            filtered_zip_ref.writestr(zip_info, zip_ref.read(zip_info.filename))
+            print(f"Filtered zip file created: {filtered_jrzip_file}")
+
+            # Extract the filtered zip file
+            with zipfile.ZipFile(filtered_jrzip_file, 'r') as zip_ref:
+                zip_ref.extractall('./data/raw')
+            print(f"Unzipped file: {filtered_jrzip_file}")
+
+            # Remove the original and filtered zip files after extraction
+            os.remove(jrzip_file)
+            os.remove(filtered_jrzip_file)
+
+            # Walk the extraction directory and collect all .txt file paths
+            for root, dirs, files in os.walk("./data/raw"):
+                for file in files:
+                    if file.endswith(".txt"):
+                        txt_files.append(os.path.join(root, file))
+            print(f"Found {len(txt_files)} .txt files.")
+        else:
+            print("The file is not a valid zip file.")
+    else:
+        print("Failed to retrieve or save the file.")
+        print(f"Response Text: {response.text}")
+
+    return txt_files
+
+
+def convert_beh():
+    txt = []
+    for root, dirs, files in os.walk('./data/raw'):
+        for file in files:
+            if file.endswith(".txt"):
+                txt.append(os.path.join(root, file))
+    print(txt)
+
+    # Parse each JSON-lines file into a DataFrame of trial records
+    count = 0
+    dic = {}
+    for b in txt:
+        count += 1
+        tweets = []
+        with open(b, 'r') as file:
+            for line in file:
+                tweets.append(json.loads(line))
+        dic[count] = pd.json_normalize(tweets, 'data')
+
+    print(dic)
+
+    # Write each parsed file to ./data/{sub}/processed/{sub}_{task}_{vers}.csv
+    paths = []
+    for i in range(1, len(dic) + 1):
+        for sub in np.unique(dic[i]['subject_id']):
+            print(sub)
+            path = f"./data/{sub}/processed/{sub}_{dic[i]['task'][0]}_{dic[i]['task_vers'][0]}.csv"
+            os.makedirs(os.path.dirname(path), exist_ok=True)
+            dic[i].to_csv(path, index=False)
+            print(f"saved {path}")
+            paths.append(path)
+
+    return paths
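The archive filtering in `get_data()` keeps only members whose path contains one of the requested study result IDs. A self-contained sketch of that filter using in-memory archives (the member paths shown are made up; JATOS' actual archive layout may differ):

import io
import zipfile

ids = [42, 43]
src = io.BytesIO()
with zipfile.ZipFile(src, "w") as z:
    z.writestr("study_result_42/comp-result_7/data.txt", "kept")
    z.writestr("study_result_99/comp-result_8/data.txt", "dropped")

dst = io.BytesIO()
with zipfile.ZipFile(src) as zin, zipfile.ZipFile(dst, "w") as zout:
    for info in zin.infolist():
        # substring match, like the original: '42' would also match '142'
        if any(str(i) in info.filename for i in ids):
            zout.writestr(info, zin.read(info.filename))

with zipfile.ZipFile(dst) as z:
    print(z.namelist())  # ['study_result_42/comp-result_7/data.txt']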
+
+
+def move_txt(txt_files):
+    dic = {}
+    for file_path in txt_files:
+        tweets = []
+        with open(file_path, 'r') as file:
+            # Read the text file and append each line as a JSON object to tweets
+            for line in file:
+                tweets.append(json.loads(line))
+        dic[file_path] = pd.json_normalize(tweets, 'data')
+
+    for file_path, df in dic.items():
+        for sub in np.unique(df['subject_id']):
+            print(sub)
+            target_dir = f'./data/{sub}/raw'
+            os.makedirs(target_dir, exist_ok=True)
+            # Save the raw records as a text file in the subject's raw directory
+            output_file = os.path.join(target_dir, os.path.basename(file_path))
+            with open(output_file, 'w') as f:
+                f.write(df.to_string(index=False))
+            print(f"Saved {output_file} to {target_dir}")
+        os.remove(file_path)
+        print(f"Removed {file_path}")
+
+    # Remove any directories left behind under data/raw
+    for root, dirs, files in os.walk('./data/raw'):
+        for d in dirs:
+            shutil.rmtree(os.path.join(root, d))
+
+    return None
+
+
+def push():
+    subprocess.run(["git", "add", "-A"])
+    subprocess.run(["git", "commit", "-m", "auto commit -> added subject task data"])
+    subprocess.run(["git", "push"])
+
+
+def main():
+    study_result_ids = get_met()
+    txt_files = get_data(study_result_ids)
+    convert_beh()
+    move_txt(txt_files)
+    push()
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
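`push()` ignores the exit status of every git command, so a failed commit or push goes unnoticed. A defensive variant, as an optional sketch (`push_checked` is a hypothetical name, not part of the module):

import subprocess

def push_checked():
    subprocess.run(["git", "add", "-A"], check=True)
    # `git diff --cached --quiet` exits 0 when nothing is staged
    if subprocess.run(["git", "diff", "--cached", "--quiet"]).returncode == 0:
        print("nothing to commit")
        return
    subprocess.run(["git", "commit", "-m", "auto commit -> added subject task data"], check=True)
    subprocess.run(["git", "push"], check=True)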