Draft flow PR #127
base: master
Changes from 7 commits
File: veniq/dataset_collection/dataflow/collectEM.py
@@ -0,0 +1,63 @@
from collections import defaultdict
from typing import List, Dict

import d6tcollect
import d6tflow
# from veniq.dataset_collection.augmentation import InvocationType
from joblib import Parallel, delayed

from veniq.ast_framework import AST
from veniq.ast_framework import ASTNodeType, ASTNode
from veniq.dataset_collection.augmentation import collect_info_about_functions_without_params
from veniq.dataset_collection.dataflow.preprocess import TaskAggregatorJavaFiles
from veniq.utils.ast_builder import build_ast

d6tcollect.submit = False


@d6tflow.requires({'csv': TaskAggregatorJavaFiles})
class TaskFindEM(d6tflow.tasks.TaskCache):
    dir_to_search = d6tflow.Parameter()
    dir_to_save = d6tflow.Parameter()
    system_cores_qty = d6tflow.IntParameter()

    def _find_EMs(self, row):
        result_dict = {}
        try:
            ast = AST.build_from_javalang(build_ast(row['original_filename']))
Review comment: long method. This whole routine about extracting method declarations could be put into a separate method.
            classes_declaration = [
                ast.get_subtree(node)
                for node in ast.get_root().types
                if node.node_type == ASTNodeType.CLASS_DECLARATION
            ]
            method_declarations: Dict[str, List[ASTNode]] = defaultdict(list)
            for class_ast in classes_declaration:
                class_declaration = class_ast.get_root()
                collect_info_about_functions_without_params(class_declaration, method_declarations)

                methods_list = list(class_declaration.methods) + list(class_declaration.constructors)
                for method_node in methods_list:
                    target_node = ast.get_subtree(method_node)
                    for method_invoked in target_node.get_proxy_nodes(
                            ASTNodeType.METHOD_INVOCATION):
                        extracted_m_decl = method_declarations.get(method_invoked.member, [])
Review comment: what is extracted_m_decl? What is its type?
                        if len(extracted_m_decl) == 1:
                            result_dict[method_invoked.line] = [target_node, method_invoked, extracted_m_decl]
            # print({'em_list': result_dict, 'ast': ast})
            if result_dict:
                print(f' ZHOPA {result_dict}')
                return [{'em_list': result_dict, 'ast': ast}]
            else:
                return {}
        except Exception:
            pass

        return {}
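To illustrate the review question about extracted_m_decl: method_declarations is a defaultdict(list) keyed by method name, so .get(name, []) yields a list of declaration nodes. A minimal self-contained sketch of that indexing (strings stand in for the real ASTNode objects, which are an assumption here):

```python
from collections import defaultdict
from typing import Dict, List

# method_declarations maps a method name to every declaration node sharing
# that name; strings stand in for the real ASTNode objects.
method_declarations: Dict[str, List[str]] = defaultdict(list)
for name in ('parse', 'parse', 'render'):
    method_declarations[name].append(f'decl_of_{name}')

# extracted_m_decl is therefore a List of declaration nodes; length 1 means
# the invoked name resolves to exactly one declaration.
extracted_m_decl = method_declarations.get('render', [])
print(len(extracted_m_decl))  # 1
```

So the `len(extracted_m_decl) == 1` check above skips overloaded or unknown method names.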

    def run(self):
        csv = self.inputLoad()['csv']
        rows = [x for _, x in csv.iterrows()]

        with Parallel(n_jobs=2, require='sharedmem') as parallel:
Review comment: why n_jobs=2?
Reply: it's just a template, we can fix it in the future.
Review comment: can you use self.system_cores_qty, like you did in the preprocess task?
            results = parallel((delayed(self._find_EMs)(a) for a in rows))
        self.save({"data": [x for x in results if x]})
Review comment: what is the type of x?
Reply: the result of _find_EMs.
Review comment: can you type-annotate the _find_EMs function then?
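Following up on the type-annotation request, a hedged sketch of what the signature could look like. The alias FindEMResult and the stubbed body are assumptions, not the author's code; in the real function the dict values are [target_node, method_invoked, extracted_m_decl] triples keyed by invocation line.

```python
from typing import Any, Dict, List, Union

# Hypothetical alias: _find_EMs returns either a one-element list wrapping
# {'em_list': ..., 'ast': ...} or an empty dict on failure.
FindEMResult = Union[List[Dict[str, Any]], Dict[Any, Any]]

def find_ems_annotated(row: Dict[str, Any]) -> FindEMResult:
    result_dict: Dict[int, List[Any]] = {}
    # ... the body would build result_dict exactly as in the diff above ...
    if result_dict:
        return [{'em_list': result_dict, 'ast': None}]
    return {}

print(find_ems_annotated({'original_filename': 'A.java'}))  # {}
```

Returning a single type (e.g. always a list, empty on failure) would simplify the filter in run() as well.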
@@ -0,0 +1,64 @@
import os
from argparse import ArgumentParser

import d6tcollect
import d6tflow

from veniq.dataset_collection.dataflow.collectEM import TaskFindEM

d6tcollect.submit = False

if __name__ == '__main__':
    system_cores_qty = os.cpu_count() or 1
    parser = ArgumentParser()
    parser.add_argument(
        "-d",
        "--dir",
        required=True,
        help="File path to JAVA source code for methods augmentations"
    )
    parser.add_argument(
        "-o", "--output",
        help="Path for file with output results",
        default='augmented_data'
    )
    parser.add_argument(
        "--jobs",
        "-j",
        type=int,
        default=system_cores_qty - 1,
        help="Number of processes to spawn. "
             "By default one less than number of cores. "
             "Be careful to raise it above, machine may stop responding while creating dataset.",
    )
    parser.add_argument(
        "-z", "--zip",
        action='store_true',
        help="To zip input and output files."
    )
    parser.add_argument(
        "-s", "--small_dataset_size",
(lyriccoder marked this conversation as resolved.)
        help="Number of files in small dataset",
        default=100,
        type=int,
    )

    args = parser.parse_args()
    d6tflow.preview(
        TaskFindEM(
            dir_to_search=args.dir,
            dir_to_save=args.output,
            system_cores_qty=args.jobs))
    d6tflow.run(
        TaskFindEM(
            dir_to_search=args.dir,
            dir_to_save=args.output,
            system_cores_qty=args.jobs
        ))
    data = TaskFindEM(
Review comment: task TaskFindEM doesn't look like the final step. Or is it?
        dir_to_search=args.dir,
        dir_to_save=args.output,
        system_cores_qty=args.jobs
    ).outputLoad(cached=False)

    print(data)
File: veniq/dataset_collection/dataflow/preprocess.py
@@ -0,0 +1,114 @@
import hashlib
import re
from pathlib import Path

import d6tcollect
import d6tflow
import pandas as pd
# from veniq.dataset_collection.augmentation import InvocationType
from pebble import ProcessPool
from tqdm import tqdm

from veniq.utils.encoding_detector import read_text_with_autodetected_encoding

d6tcollect.submit = False


class TaskAggregatorJavaFiles(d6tflow.tasks.TaskCSVPandas):
    dir_to_search = d6tflow.Parameter()
    dir_to_save = d6tflow.Parameter()
    system_cores_qty = d6tflow.IntParameter()

    columns = [
        'project_id',
        'original_filename',
        'class_name',
        'invocation_line_string',
        'invocation_line_number_in_original_file',
        'target_method',
        'target_method_start_line',
        'extract_method',
        'extract_method_start_line',
        'extract_method_end_line',
        'output_filename',
        'is_valid_ast',
        'insertion_start',
        'insertion_end',
        'ncss_target',
        'ncss_extracted',
        'do_nothing',
        'ONE_LINE_FUNCTION',
        'NO_IGNORED_CASES'
    ]  # + [x for x in InvocationType.list_types()]

    def _remove_comments(self, string: str):
        # first group captures quoted strings (double or single),
        # second group captures comments (// single-line or /* multi-line */)
        pattern = r"(\".*?\"|\'.*?\')|(/\*.*?\*/|//[^\r\n]*$)"
        regex = re.compile(pattern, re.MULTILINE | re.DOTALL)

        def _replacer(match):
            # if the 2nd group (capturing comments) is not None,
            # we have captured a real, non-quoted comment string,
            if match.group(2) is not None:
                # so return an empty string to remove the comment
                return ""
            else:  # otherwise, return the 1st group: the captured quoted string
                return match.group(1)

        return regex.sub(_replacer, string)
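The pattern in _remove_comments can be exercised standalone. This sketch duplicates the regex to show why the quoted-string group exists: a // inside a string literal survives, while real comments are dropped.

```python
import re

# Same two-group pattern as _remove_comments: group 1 matches quoted
# strings (kept), group 2 matches // and /* */ comments (dropped).
pattern = r"(\".*?\"|\'.*?\')|(/\*.*?\*/|//[^\r\n]*$)"
regex = re.compile(pattern, re.MULTILINE | re.DOTALL)

def remove_comments(source: str) -> str:
    def replacer(match):
        if match.group(2) is not None:
            return ""          # a real comment: remove it
        return match.group(1)  # a quoted string: keep it verbatim
    return regex.sub(replacer, source)

java = 'String url = "http://x"; // drop me\n/* and me */ int a = 1;'
cleaned = remove_comments(java)
print(cleaned)  # the string literal is intact, both comments are gone
```

Without group 1 matching first, the `//` inside `"http://x"` would be treated as a comment start and the rest of the line deleted.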

    def _preprocess(self, file):
        original_text = read_text_with_autodetected_encoding(str(file))
        # remove comments
        text_without_comments = self._remove_comments(original_text)
        # remove trailing whitespace and blank lines
        text = "\n".join([ll.rstrip() for ll in text_without_comments.splitlines() if ll.strip()])

        return text

    def _save_text_to_new_file(self, input_dir: Path, text: str, filename: Path) -> Path:
        # need to avoid a situation when filenames are the same
        hash_path = hashlib.sha256(str(filename.parent).encode('utf-8')).hexdigest()
        dst_filename = input_dir / f'{filename.stem}_{hash_path}.java'
        if not dst_filename.parent.exists():
            dst_filename.parent.mkdir(parents=True)
        if not dst_filename.exists():
            with open(dst_filename, 'w', encoding='utf-8') as w:
                w.write(text)

        return dst_filename
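The parent-directory hash in _save_text_to_new_file can be checked in isolation; unique_name below is a hypothetical extraction of just the naming logic, not a function in the PR.

```python
import hashlib
from pathlib import Path

def unique_name(input_dir: Path, filename: Path) -> Path:
    # hash the parent directory so two files sharing a stem
    # (a/Foo.java vs b/Foo.java) get distinct output names
    hash_path = hashlib.sha256(str(filename.parent).encode('utf-8')).hexdigest()
    return input_dir / f'{filename.stem}_{hash_path}.java'

a = unique_name(Path('input'), Path('projA/src/Foo.java'))
b = unique_name(Path('input'), Path('projB/src/Foo.java'))
print(a != b)  # True: same stem, different parents
```

The hash is deterministic, which is what makes the `if not dst_filename.exists()` skip on re-runs work.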

    def run(self):
        test_files = set(Path(self.dir_to_search).glob('**/*Test*.java'))
        not_test_files = set(Path(self.dir_to_search).glob('**/*.java'))
        files_without_tests = list(not_test_files.difference(test_files))
        if not files_without_tests:
            raise Exception("No java files were found")

        full_dataset_folder = Path(self.dir_to_save) / 'full_dataset'
Review comment: long method. The part with directory handling could be extracted, for example.
        if not full_dataset_folder.exists():
            full_dataset_folder.mkdir(parents=True)
        self.output_dir = full_dataset_folder / 'output_files'
        if not self.output_dir.exists():
            self.output_dir.mkdir(parents=True)
        self.input_dir = full_dataset_folder / 'input_files'
        if not self.input_dir.exists():
            self.input_dir.mkdir(parents=True)
        df = pd.DataFrame(columns=['original_filename'])
        with ProcessPool(self.system_cores_qty) as executor:
            future = executor.map(self._preprocess, files_without_tests, timeout=200)
            result = future.result()
            for filename in tqdm(files_without_tests):
                try:
                    text = next(result)
                    if text:
Review comment: under what condition can text be empty?
Reply: when the result is an empty string, e.g. when the file consists only of comments.
Review comment: when can the result be empty, besides the case when the source code is empty?
Reply: when we pass an empty file, I believe.
Review comment: can you leave a comment about it, maybe?
                        df = df.append(
                            {'original_filename': self._save_text_to_new_file(self.input_dir, text,
                                                                              filename).absolute()},
                            ignore_index=True
                        )
                except Exception as e:
                    print(str(e))

        self.save(data=df)
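The test-file exclusion at the top of run() relies on a set difference over two globs: everything matching *Test*.java is collected separately and subtracted from the full set of .java files. A self-contained sketch on a temporary directory:

```python
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as d:
    root = Path(d)
    for name in ('Foo.java', 'FooTest.java', 'Bar.java'):
        (root / name).write_text('class X {}')

    # same filtering as in run(): glob test files separately, then subtract
    test_files = set(root.glob('**/*Test*.java'))
    not_test_files = set(root.glob('**/*.java'))
    files_without_tests = sorted(p.name for p in not_test_files - test_files)
    print(files_without_tests)  # ['Bar.java', 'Foo.java']
```

Note the pattern excludes any file whose name merely contains "Test", e.g. a hypothetical TestUtils.java would be dropped too.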
Review comment: can you rename row to something a bit more informative? What's in the row?