From 24ea564805c235d371388f63711e77ec44113605 Mon Sep 17 00:00:00 2001
From: Rafael Mota
Date: Mon, 23 Dec 2019 17:25:36 -0300
Subject: [PATCH 1/6] Refactor fetch_jars script

---
 scripts/fetch_jars.py | 220 +++++++++++++++++++++++++-----------------
 1 file changed, 130 insertions(+), 90 deletions(-)

diff --git a/scripts/fetch_jars.py b/scripts/fetch_jars.py
index 074d093c7..c8bf4c7a9 100644
--- a/scripts/fetch_jars.py
+++ b/scripts/fetch_jars.py
@@ -4,106 +4,128 @@
 import subprocess
 import time
 import shutil
-import os
+from os import path
+from os import listdir
+import csv
 
 PATH = "path"
 NAME = "name"
+BRANCH = "branch"
+STATE = "state"
+
+MERGE_COMMIT = "merge commit"
 RESULT = "result"
 
 GITHUB_API= "https://api.github.com"
 TRAVIS_API = "https://api.travis-ci.org"
 
 LOGIN = "login"
-DOWNLOAD_URL='browser_download_url'
+BROWSER_DOWNLOAD_URL='browser_download_url'
 ASSETS="assets"
 
 MESSAGE_PREFIX="Trigger build #"
 RELEASE_PREFIX= "fetchjar-"
+FINISHED = "finished"
+UNTAGGED = "untagged"
 
-inputPath = sys.argv[1] # input path passed as cli argument
-outputPath = sys.argv[2] # output path passed as cli argument
+input_path = sys.argv[1] # input path passed as cli argument
+output_path = sys.argv[2].rstrip("/") # output path passed as cli argument
 token = sys.argv[3] # token passed as cli argument
 
-def fetchJars(inputPath, outputPath, token):
+def fetch_jars(input_path, output_path, token):
     # this method reads a csv input file, with the projects name and path
     # for each project it downloads the build generated via github releases
     # and moves the builds to the output generated by the framework
-
     print("Starting build collection")
 
-    tokenUser = get_github_user(token)[LOGIN]
+    token_user_name = get_github_user(token)[LOGIN]
 
-    parsedInput = read_input(inputPath)
-    parsedOutput = read_output(outputPath)
-    newResultsFile = []
+    parsed_input = read_csv(input_path, ",")
+    parsed_output = read_csv(output_path + "/data/results.csv", ";")
+
+    parsed_output_hash = output_to_hash(parsed_output)
 
-    for project in parsedInput:
+    new_results_file = []
 
-        splitedProjectPath = project[PATH].split('/')
-        projectName = splitedProjectPath[len(splitedProjectPath) - 1]
-        githubProject = tokenUser + '/' + projectName
-        print (projectName)
+    for project in parsed_input:
 
-        get_builds_and_wait(githubProject)
+        splited_project_path = project[PATH].split('/')
+        project_name = splited_project_path[len(splited_project_path) - 1]
+        github_project = token_user_name + '/' + project_name
+        print (project_name)
 
-        releases = get_github_releases(token, githubProject)
+        get_builds_and_wait(github_project)
 
+        releases = get_github_releases(token, github_project)
         # download the releases for the project moving them to the output directories
        for release in releases:
             # check if release was generated by the framework
             if (release[NAME].startswith(RELEASE_PREFIX)):
-                commitSHA = release[NAME].replace(RELEASE_PREFIX, '')
-                print ("Downloading " + commitSHA )
+                commit_sha = strip_release_prefix(release)
+
+                # preparing and downloading build
+                print ("Scenario: " + commit_sha)
+
                 try:
-                    downloadPath = mount_download_path(outputPath, project, commitSHA)
-                    downloadUrl = release[ASSETS][0][DOWNLOAD_URL]
-                    download_file(downloadUrl, downloadPath)
-                    if (commitSHA in parsedOutput):
-                        newResultsFile.append(parsedOutput[commitSHA])
-                    untar_and_remove_file(downloadPath)
-                    print (downloadPath + ' is ready')
-                except:
-                    pass
+                    download_build(output_path, project, commit_sha, release)
+
+                    new_results_file.append(parsed_output_hash[commit_sha])
-        remove_commit_files_without_builds (outputPath, projectName)
+                    print ("Scenario is ready")
+                except Exception as e:
+                    print ("Error downloading scenario: " + str(e))
+
+        remove_commit_files_without_builds (output_path, project_name)
 
-    with open(outputPath + "/data/results-with-builds.csv", 'w') as outputFile:
-        outputFile.write("project;merge commit;className;method;left modifications;left deletions;right modifications;right deletions\n")
-        outputFile.write("\n".join(newResultsFile))
-        outputFile.close()
-
-def read_output(outputPath):
-    fo = open(outputPath + "/data/results.csv")
-    file = fo.read()
-    fo.close()
-
-    fileOutLines = file.split("\n")
-    return parse_output(fileOutLines)
-
-def parse_output(lines):
-    result = {}
-    for line in lines[1:]:
-        cells = line.split(";")
-        if (len (cells) > 1):
-            result[cells[1]] = line
-    return result
-
-def read_input(inputPath):
-    f = open(inputPath, "r")
-    file = f.read()
-    f.close()
-
-    bruteLines = file.split("\n")
-    return parse_input(bruteLines)
-
-def parse_input(lines):
-    # parse framework input csv file
-    result = []
-    for line in lines[1:]:
-        cells = line.split(",")
-        if (len (cells) > 1):
-            method = {}
-            method[NAME] = cells[0]
-            method[PATH] = cells[1]
-            result.append(method)
-    return result
+    save_results_with_builds(output_path, new_results_file)
+
+def download_build(output_path, project, commit_sha, release):
+    print ("Downloading")
+    scenario_path = mount_scenario_path(output_path, project, commit_sha)
+
+    if path.exists(scenario_path):
+        tar_path = scenario_path + "result.tar.gz"
+
+        build_path = scenario_path + "build"
+
+        if path.exists(tar_path):
+            raise Exception(tar_path + " already exists")
+
+        if path.exists(build_path):
+            raise Exception(build_path + " already exists")
+
+        download_url = get_download_url(release)
+
+        download_file(download_url, tar_path)
+
+        untar_and_remove_file(tar_path, build_path)
+    else:
+        raise Exception("Scenario folder: " + scenario_path + " doesn't exist")
+
+def output_to_hash(parsed_output):
+    parsed_output_hash = {}
+
+    for scenario in parsed_output:
+        parsed_output_hash[scenario[MERGE_COMMIT]] = scenario
+
+    return parsed_output_hash
+
+def strip_release_prefix(release):
+    return release[NAME].replace(RELEASE_PREFIX, '')
+
+def get_download_url(release):
+    return release[ASSETS][0][BROWSER_DOWNLOAD_URL]
+
+def save_results_with_builds(output_path, new_results_file):
+    with open(output_path + "/data/results-with-builds.csv", 'w') as outputFile:
+        csv_writer = csv.DictWriter(outputFile, delimiter=";",
+            fieldnames=["project","merge commit","className","method","left modifications","left deletions","right modifications","right deletions"])
+
+        csv_writer.writeheader()
+        for scenario in new_results_file:
+            csv_writer.writerow(scenario)
+
+
+def read_csv(file_path, delimiter):
+    with open(file_path, "r") as input_lines:
+        return list(csv.DictReader(input_lines, delimiter=delimiter))
 
 def download_file(url, target_path):
     # download file from url
@@ -111,28 +133,29 @@ def download_file(url, target_path):
     if response.status_code == 200:
         with open(target_path, 'wb') as f:
             f.write(response.raw.read())
+    else:
+        raise Exception("Download request returned status code: " + str(response.status_code))
 
-def mount_download_path(outputPath, project, commitSHA):
+def mount_scenario_path(output_path, project, commit_sha):
     # mount path where the downloaded build will be moved to
-    return outputPath + '/files/' + project[NAME] + '/' + commitSHA + '/result.tar.gz'
commit_sha + "/" -def untar_and_remove_file(downloadPath): - downloadDir = downloadPath.replace('result.tar.gz', '') - subprocess.call(['mkdir', downloadDir + 'build']) - subprocess.call(['tar', '-xf', downloadPath, '-C', downloadDir + '/build', ]) - subprocess.call(['rm', downloadPath]) +def untar_and_remove_file(tar_path, output_path): + subprocess.call(['mkdir', output_path]) + subprocess.call(['tar', '-xf', tar_path, '-C', output_path ]) + subprocess.call(['rm', tar_path]) def get_builds_and_wait(project): has_pendent = True filtered_builds = [] while (has_pendent): builds = get_travis_project_builds(project) - filtered_builds = filter (lambda x: not x["branch"].startswith("untagged"), builds) + filtered_builds = filter (lambda x: not x[BRANCH].startswith(UNTAGGED), builds) has_pendent = False for build in filtered_builds: - print (build["state"]) - has_pendent = has_pendent or (build["state"] != "finished") + print (build[BRANCH] + " status: " + build[STATE] ) + has_pendent = has_pendent or (build[STATE] != FINISHED) if (has_pendent): print ("Waiting 30 seconds") @@ -142,34 +165,51 @@ def get_builds_and_wait(project): def get_travis_project_builds(project): - return requests.get(TRAVIS_API + '/repos/' + project + '/builds').json() + try: + res = requests.get(TRAVIS_API + '/repos/' + project + '/builds') + res.raise_for_status() + return res.json() + except Exception as e: + raise Exception("Error getting travis builds: " + str(e)) + def get_github_user(token): - return requests.get(GITHUB_API + '/user', headers=get_headers(token)).json() + try: + res = requests.get(GITHUB_API + '/user', headers=get_headers(token)) + res.raise_for_status() + + return res.json() + except Exception as e: + raise Exception("Error getting github user: " + str(e)) def get_github_releases(token, project): - return requests.get(GITHUB_API + '/repos/' + project + '/releases', headers=get_headers(token)).json() + try: + res = requests.get(GITHUB_API + '/repos/' + project + '/releases', headers=get_headers(token)) + res.raise_for_status() + + return res.json() + except Exception as e: + raise Exception("Error getting github releases: " + str(e)) def get_headers(token): return { "Authorization": "token " + token } +def remove_commit_files_without_builds (output_path, project_name): + files_path = output_path + "/files/" + project_name + "/" -def remove_commit_files_without_builds (outputPath, projectName): - files_path = outputPath + "/files/" + projectName + "/" - - if (os.path.exists(files_path)): - commit_dirs = os.listdir(files_path) + if (path.exists(files_path)): + commit_dirs = listdir(files_path) for directory in commit_dirs: commit_dir = files_path + directory build_dir = commit_dir + "/build" - if (not os.path.exists(build_dir)): + if (not path.exists(build_dir)): shutil.rmtree(commit_dir) - if (len (os.listdir(files_path)) == 0 ): + if (len (listdir(files_path)) == 0 ): shutil.rmtree(files_path) -fetchJars(inputPath, outputPath, token) \ No newline at end of file +fetch_jars(input_path, output_path, token) \ No newline at end of file From c14279321c62b86f44a6249a8816813cf59e3497 Mon Sep 17 00:00:00 2001 From: Rafael Mota Date: Mon, 23 Dec 2019 17:33:58 -0300 Subject: [PATCH 2/6] Add try catch in project level --- scripts/fetch_jars.py | 53 +++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/scripts/fetch_jars.py b/scripts/fetch_jars.py index c8bf4c7a9..a6e0f3b1d 100644 --- a/scripts/fetch_jars.py +++ b/scripts/fetch_jars.py @@ -4,8 +4,7 @@ import 
@@ -4,8 +4,7 @@
 import subprocess
 import time
 import shutil
-from os import path
-from os import listdir
+from os import path, listdir
 import csv
 
 PATH = "path"
@@ -45,34 +44,37 @@ def fetch_jars(input_path, output_path, token):
     new_results_file = []
 
     for project in parsed_input:
-
         splited_project_path = project[PATH].split('/')
         project_name = splited_project_path[len(splited_project_path) - 1]
         github_project = token_user_name + '/' + project_name
-        print (project_name)
+
+        try:
+            print (project_name)
 
-        get_builds_and_wait(github_project)
+            get_builds_and_wait(github_project)
 
-        releases = get_github_releases(token, github_project)
-        # download the releases for the project moving them to the output directories
-        for release in releases:
-            # check if release was generated by the framework
-            if (release[NAME].startswith(RELEASE_PREFIX)):
-                commit_sha = strip_release_prefix(release)
+            releases = get_github_releases(token, github_project)
+            # download the releases for the project moving them to the output directories
+            for release in releases:
+                # check if release was generated by the framework
+                if (release[NAME].startswith(RELEASE_PREFIX)):
+                    commit_sha = strip_release_prefix(release)
 
-                # preparing and downloading build
-                print ("Scenario: " + commit_sha)
+                    # preparing and downloading build
+                    print ("Scenario: " + commit_sha)
 
-                try:
-                    download_build(output_path, project, commit_sha, release)
-
-                    new_results_file.append(parsed_output_hash[commit_sha])
-
-                    print ("Scenario is ready")
-                except Exception as e:
-                    print ("Error downloading scenario: " + str(e))
+                    try:
+                        download_build(output_path, project, commit_sha, release)
+
+                        new_results_file.append(parsed_output_hash[commit_sha])
+
+                        print ("Scenario is ready")
+                    except Exception as e:
+                        print ("Error downloading scenario: " + str(e))
 
-        remove_commit_files_without_builds (output_path, project_name)
+            remove_commit_files_without_builds (output_path, project_name)
+        except Exception as e:
+            print ("Error fetching builds for project " + project_name + ": " + str(e))
 
     save_results_with_builds(output_path, new_results_file)
 
@@ -165,8 +167,9 @@ def get_builds_and_wait(project):
 
 
 def get_travis_project_builds(project):
+    res = requests.get(TRAVIS_API + '/repos/' + project + '/builds')
+
     try:
-        res = requests.get(TRAVIS_API + '/repos/' + project + '/builds')
         res.raise_for_status()
 
         return res.json()
@@ -174,8 +177,8 @@ def get_travis_project_builds(project):
         raise Exception("Error getting travis builds: " + str(e))
 
 def get_github_user(token):
+    res = requests.get(GITHUB_API + '/user', headers=get_headers(token))
     try:
-        res = requests.get(GITHUB_API + '/user', headers=get_headers(token))
         res.raise_for_status()
 
         return res.json()
@@ -183,8 +186,8 @@ def get_github_user(token):
         raise Exception("Error getting github user: " + str(e))
 
 def get_github_releases(token, project):
+    res = requests.get(GITHUB_API + '/repos/' + project + '/releases', headers=get_headers(token))
     try:
-        res = requests.get(GITHUB_API + '/repos/' + project + '/releases', headers=get_headers(token))
         res.raise_for_status()
 
         return res.json()

From c01e2927ef54944755deb97354519f552f251aaf Mon Sep 17 00:00:00 2001
From: Rafael Mota
Date: Mon, 23 Dec 2019 20:03:05 -0300
Subject: [PATCH 3/6] Add more error handling

---
 scripts/fetch_jars.py | 97 +++++++++++++++++++++++--------------------
 1 file changed, 52 insertions(+), 45 deletions(-)

diff --git a/scripts/fetch_jars.py b/scripts/fetch_jars.py
index a6e0f3b1d..b3012518d 100644
--- a/scripts/fetch_jars.py
+++ b/scripts/fetch_jars.py
@@ -34,53 +34,60 @@ def fetch_jars(input_path, output_path, token):
     # and moves the builds to the output generated by the framework
     print("Starting build collection")
 
-    token_user_name = get_github_user(token)[LOGIN]
-
-    parsed_input = read_csv(input_path, ",")
-    parsed_output = read_csv(output_path + "/data/results.csv", ";")
-
-    parsed_output_hash = output_to_hash(parsed_output)
-
-    new_results_file = []
+    try:
+        token_user_name = get_github_user(token)[LOGIN]
 
-    for project in parsed_input:
-        splited_project_path = project[PATH].split('/')
-        project_name = splited_project_path[len(splited_project_path) - 1]
-        github_project = token_user_name + '/' + project_name
+        parsed_input = read_csv(input_path, ",")
+        parsed_output = read_csv(output_path + "/data/results.csv", ";")
 
-        try:
-            print (project_name)
-
-            get_builds_and_wait(github_project)
-
-            releases = get_github_releases(token, github_project)
-            # download the releases for the project moving them to the output directories
-            for release in releases:
-                # check if release was generated by the framework
-                if (release[NAME].startswith(RELEASE_PREFIX)):
-                    commit_sha = strip_release_prefix(release)
-
-                    # preparing and downloading build
-                    print ("Scenario: " + commit_sha)
-
-                    try:
-                        download_build(output_path, project, commit_sha, release)
-
-                        new_results_file.append(parsed_output_hash[commit_sha])
+        parsed_output_hash = output_to_hash(parsed_output)
+
+        new_results_file = []
+
+        for project in parsed_input:
+            splited_project_path = project[PATH].split('/')
+            project_name = splited_project_path[len(splited_project_path) - 1]
+            github_project = token_user_name + '/' + project_name
+
+            # check if framework used optional custom name
+            if project[NAME]:
+                project_name = project[NAME]
+
+            try:
+                print (project_name)
+
+                get_builds_and_wait(github_project)
+
+                releases = get_github_releases(token, github_project)
+                # download the releases for the project moving them to the output directories
+                for release in releases:
+                    # check if release was generated by the framework
+                    if (release[NAME].startswith(RELEASE_PREFIX)):
+                        commit_sha = strip_release_prefix(release)
+
+                        # preparing and downloading build
+                        print ("Scenario: " + commit_sha)
+
+                        try:
+                            download_build(output_path, project, commit_sha, release)
+
+                            new_results_file.append(parsed_output_hash[commit_sha])
+
+                            print ("Scenario is ready")
+                        except Exception as e:
+                            print ("Error downloading scenario: " + str(e))
+
+                remove_commit_files_without_builds (output_path, project_name)
+            except Exception as e:
+                print ("Error fetching builds for project " + project_name + ": " + str(e))
 
-                        print ("Scenario is ready")
-                    except Exception as e:
-                        print ("Error downloading scenario: " + str(e))
-
-            remove_commit_files_without_builds (output_path, project_name)
-        except Exception as e:
-            print ("Error fetching builds for project " + project_name + ": " + str(e))
-
-    save_results_with_builds(output_path, new_results_file)
+        save_results_with_builds(output_path, new_results_file)
+    except Exception as e:
+        print (e)
 
-def download_build(output_path, project, commit_sha, release):
+def download_build(output_path, project_name, commit_sha, release):
     print ("Downloading")
-    scenario_path = mount_scenario_path(output_path, project, commit_sha)
+    scenario_path = mount_scenario_path(output_path, project_name, commit_sha)
 
     if path.exists(scenario_path):
         tar_path = scenario_path + "result.tar.gz"
@@ -110,7 +117,7 @@ def output_to_hash(parsed_output):
     return parsed_output_hash
 
 def strip_release_prefix(release):
-    return release[NAME].replace(RELEASE_PREFIX, '')
+    return release[NAME].replace(RELEASE_PREFIX, "")
 
 def get_download_url(release):
     return release[ASSETS][0][BROWSER_DOWNLOAD_URL]
@@ -138,9 +145,9 @@ def download_file(url, target_path):
     else:
         raise Exception("Download request returned status code: " + str(response.status_code))
 
-def mount_scenario_path(output_path, project, commit_sha):
+def mount_scenario_path(output_path, project_name, commit_sha):
     # mount path where the downloaded build will be moved to
-    return output_path + '/files/' + project[NAME] + '/' + commit_sha + "/"
+    return output_path + '/files/' + project_name + '/' + commit_sha + "/"
 
 def untar_and_remove_file(tar_path, output_path):
     subprocess.call(['mkdir', output_path])

From 13c865ef86bdd70377f8cc749eb9d1a0a99aa674 Mon Sep 17 00:00:00 2001
From: Rafael Mota
Date: Wed, 25 Dec 2019 20:52:16 -0300
Subject: [PATCH 4/6] Modify fetch_jars script to support projects sharing a
 repository under different names

---
 scripts/fetch_jars.py | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/scripts/fetch_jars.py b/scripts/fetch_jars.py
index 12541aff3..40fb97340 100644
--- a/scripts/fetch_jars.py
+++ b/scripts/fetch_jars.py
@@ -12,6 +12,8 @@
 PATH = "path"
 NAME = "name"
 
+FORK_URL = "fork_url"
+
 BRANCH = "branch"
 STATE = "state"
@@ -43,19 +45,14 @@ def fetch_jars(input_path, output_path, token):
 
         parsed_input = read_csv(input_path, ",")
         parsed_output = read_csv(output_path + "/data/results.csv", ";")
 
+        projects = map(lambda p: process_project(p, token_user_name), parsed_input)
         parsed_output_hash = output_to_hash(parsed_output)
-        print (parsed_output_hash)
 
         new_results_file = []
 
-        for project in parsed_input:
-            splited_project_path = project[PATH].split('/')
-            project_name = splited_project_path[len(splited_project_path) - 1]
-            github_project = token_user_name + '/' + project_name
-
-            # check if framework used optional custom name
-            if project[NAME]:
-                project_name = project[NAME]
+        for project in projects:
+            github_project = project[FORK_URL]
+            project_name = project[NAME]
 
             try:
                 print (project_name)
@@ -75,7 +72,7 @@ def fetch_jars(input_path, output_path, token):
                         try:
                             download_build(output_path, project_name, commit_sha, release)
 
-                            new_results_file.append(parsed_output_hash[commit_sha])
+                            new_results_file.append(parsed_output_hash[project_name + commit_sha])
 
                             print ("Scenario is ready")
                         except Exception as e:
@@ -89,6 +86,20 @@ def fetch_jars(input_path, output_path, token):
     except Exception as e:
         print (e)
 
+def process_project(data, token_user_name):
+    project = {}
+
+    splited_project_path = data[PATH].split('/')
+    github_project_name = splited_project_path[len(splited_project_path) - 1]
+    github_project = token_user_name + '/' + github_project_name
+
+    project[PATH] = data[PATH]
+    project[FORK_URL] = github_project
+    # check if framework used optional custom name
+    project[NAME] = data[NAME] if data[NAME] else github_project_name
+
+    return project
+
 def download_build(output_path, project_name, commit_sha, release):
     print ("Downloading")
     scenario_path = mount_scenario_path(output_path, project_name, commit_sha)
@@ -116,7 +127,7 @@ def output_to_hash(parsed_output):
     parsed_output_hash = {}
 
     for scenario in parsed_output:
-        parsed_output_hash[scenario[MERGE_COMMIT]] = scenario
+        parsed_output_hash[scenario[PROJECT] + scenario[MERGE_COMMIT]] = scenario
 
     return parsed_output_hash

From 58bc3c3210410d98a878f8b2436a2a6234a48c76 Mon Sep 17 00:00:00 2001
From: Rafael Mota
Date: Wed, 25 Dec 2019 21:05:19 -0300
Subject: [PATCH 5/6] Treat "already exists" errors in a better way

---
 scripts/fetch_jars.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/scripts/fetch_jars.py b/scripts/fetch_jars.py
index 40fb97340..d36729194 100644
--- a/scripts/fetch_jars.py
+++ b/scripts/fetch_jars.py
@@ -33,6 +33,13 @@
 output_path = sys.argv[2].rstrip("/") # output path passed as cli argument
 token = sys.argv[3] # token passed as cli argument
 
+class AlreadyExistsException(Exception):
+    def __init__(self, file_path):
+        self.file_path = file_path
+        self.message = file_path + " already exists"
+    def __str__(self):
+        return self.message
+
 def fetch_jars(input_path, output_path, token):
     # this method reads a csv input file, with the projects name and path
     # for each project it downloads the build generated via github releases
@@ -75,6 +82,10 @@ def fetch_jars(input_path, output_path, token):
                             new_results_file.append(parsed_output_hash[project_name + commit_sha])
 
                             print ("Scenario is ready")
+                        except AlreadyExistsException as ae:
+                            new_results_file.append(parsed_output_hash[project_name + commit_sha])
+
+                            print (ae)
                         except Exception as e:
                             print ("Error downloading scenario: " + str(e))
 
@@ -110,10 +121,10 @@ def download_build(output_path, project_name, commit_sha, release):
         build_path = scenario_path + "build"
 
         if path.exists(tar_path):
-            raise Exception(tar_path + " already exists")
+            raise AlreadyExistsException(tar_path)
 
         if path.exists(build_path):
-            raise Exception(build_path + " already exists")
+            raise AlreadyExistsException(build_path)
 
         download_url = get_download_url(release)

From ba45ed62bb2f8c513bf5b9db03f1eb60107f8d23 Mon Sep 17 00:00:00 2001
From: Rafael Mota
Date: Wed, 25 Dec 2019 22:28:49 -0300
Subject: [PATCH 6/6] Refactor and make parse_to_soot script more robust

---
 scripts/parse_to_soot.py | 94 ++++++++++++++++++----------------------
 1 file changed, 41 insertions(+), 53 deletions(-)

diff --git a/scripts/parse_to_soot.py b/scripts/parse_to_soot.py
index 6255a4edd..d8b832696 100644
--- a/scripts/parse_to_soot.py
+++ b/scripts/parse_to_soot.py
@@ -1,71 +1,59 @@
 # This script receives as input the path to a directory generated by the miningframework; it reads the output files and creates a [output]/data/results-soot.csv with the output in a format supported by the Soot analysis framework
 import sys
+from csv import DictReader, writer
 
-CLASS_NAME = "class_name"
-LEFT_MODIFICATION = "leftModification"
-RIGHT_MODIFICATION = "rightModfication"
-COMMIT_SHA = "commitSha"
-PROJECT_NAME = "projectName"
+CLASS_NAME = "className"
+LEFT_MODIFICATIONS = "left modifications"
+RIGHT_MODIFICATIONS = "right modifications"
+COMMIT_SHA = "merge commit"
+PROJECT_NAME = "project"
 
-output_path = sys.argv[1] # get output path passed as cli argument
+output_path = sys.argv[1].rstrip("/") # get output path passed as cli argument
 
 def export_csv():
-    f = open(output_path + "/data/results.csv", "r")
-    file = f.read()
-    f.close()
+    print ("Running parse to soot")
+    scenarios = read_output(output_path)
+
+    for scenario in scenarios:
+        base_path = get_scenario_base_path(scenario)
 
-    brute_lines = file.split("\n")
+        left_modifications = parse_modifications(scenario[LEFT_MODIFICATIONS])
+        right_modifications = parse_modifications(scenario[RIGHT_MODIFICATIONS])
+        class_name = scenario[CLASS_NAME]
 
-    parsed = parse_output(brute_lines)
-    csv = generate_csv(parsed)
+        result = []
+        result_reverse = []
+
+        for line in left_modifications:
+            if line not in right_modifications:
+                result.append([class_name, "sink", line])
+                result_reverse.append([class_name, "source", line])
+
+        for line in right_modifications:
+            if line not in left_modifications:
+                result.append([class_name, "source", line])
+                result_reverse.append([class_name, "sink", line])
+
+        if result:
+            with open(base_path + "/soot.csv", "w") as soot, open(base_path + "/soot-reverse.csv", "w") as soot_reverse:
+                soot_writer = writer(soot, delimiter=",")
+                soot_reverse_writer = writer(soot_reverse, delimiter=",")
 
-def parse_output(lines):
-    result = []
-    for line in lines[1:]:
-        cells = line.split(";")
-        if (len (cells) > 1):
-            method = {}
-            method[PROJECT_NAME] = cells[0]
-            method[COMMIT_SHA] = cells[1]
-            method[CLASS_NAME] = cells[2]
-            method[LEFT_MODIFICATION] = parse_modification(cells[4])
-            method[RIGHT_MODIFICATION] = parse_modification(cells[6])
-            result.append(method)
-    return result
+
+                soot_writer.writerows(result)
+                soot_reverse_writer.writerows(result_reverse)
 
-def parse_modification(modifications):
+def read_output(output_path):
+    with open(output_path + "/data/results-with-builds.csv", "r") as output_file:
+        return list(DictReader(output_file, delimiter=";"))
+
+def parse_modifications(modifications):
     trimmed_input = modifications.strip("[]").replace(" ", "")
     if (len (trimmed_input) > 0):
         return trimmed_input.split(",")
     return []
 
-def generate_csv(collection):
-    for elem in collection:
-        result = []
-        result_reverse = []
-        class_name = elem[CLASS_NAME]
-        left_modifications = elem[LEFT_MODIFICATION]
-        right_modifications = elem[RIGHT_MODIFICATION]
-        for l in left_modifications:
-            if l not in right_modifications:
-                result_reverse.append(class_name + ",sink," + l)
-                result.append(class_name + ",source," + l)
-        for r in right_modifications:
-            if r not in left_modifications:
-                result_reverse.append(class_name + ",source," + r)
-                result.append(class_name + ",sink," + r)
-        try:
-            if result:
-                base_path = output_path + "/files/" + elem[PROJECT_NAME] + "/" + elem[COMMIT_SHA]
-                save_file(base_path + "/soot.csv", result)
-                save_file(base_path + "/soot-reverse.csv", result_reverse)
-        except:
-            pass
-
-def save_file(filePath, result):
-    csv_file = open(filePath, "w")
-    csv_file.write("\n".join(result))
-    csv_file.close()
+def get_scenario_base_path(scenario):
+    return output_path + "/files/" + scenario[PROJECT_NAME] + "/" + scenario[COMMIT_SHA]
 
 export_csv()
\ No newline at end of file
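
Usage sketch (a minimal illustration based on the sys.argv handling in the patched scripts; "projects.csv", "output", and the token value are placeholder names, not part of the patches):

    # fetch_jars.py <input csv> <framework output dir> <github token>
    # the input csv is read with a "," delimiter and "name"/"path" columns;
    # from patch 3 onward an empty "name" falls back to the last path segment.
    # The output dir must already contain the data/results.csv produced by the
    # miningframework, which is read with ";" and the header:
    # project;merge commit;className;method;left modifications;left deletions;right modifications;right deletions
    python scripts/fetch_jars.py projects.csv output <github-token>

    # parse_to_soot.py <framework output dir>
    # reads data/results-with-builds.csv and writes soot.csv / soot-reverse.csv
    # next to each downloaded build under output/files/<project>/<merge commit>/
    python scripts/parse_to_soot.py output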