perf: c; fix: cpp comments; feat: rb

bionicles · Dec 22, 2023 · 8cc3732 · 8cc3732
1 parent 8c6434e
commit 8cc3732
Show file tree

Hide file tree

Showing 5 changed files with 165 additions and 64 deletions.
diff --git a/tests/more_languages/group3/cpp_test.cpp b/tests/more_languages/group3/cpp_test.cpp
@@ -81,7 +81,7 @@ class Cat : public Animal {
 nb::bytes BuildRnnDescriptor(int input_size, int hidden_size, int num_layers,
                              int batch_size, int max_seq_length, float dropout,
                              bool bidirectional, bool cudnn_allow_tf32,
-			     int workspace_size, int reserve_space_size) {
+			     int workspace_size, int reserve_space_size/* = {} */) {
   return PackDescriptor(RnnDescriptor{
       input_size, hidden_size, num_layers, batch_size, max_seq_length, dropout,
       bidirectional, cudnn_allow_tf32, workspace_size, reserve_space_size

diff --git a/tests/more_languages/group3/ruby_test.rb b/tests/more_languages/group3/ruby_test.rb
@@ -0,0 +1,28 @@
+# ruby_test.rb
+# Fixture module: exposes one module-level (singleton) method.
+module Greeter
+  # Called as Greeter.say_hello; prints a fixed greeting via puts.
+  def self.say_hello
+    puts 'Hello from the Greeter module!'
+  end
+end
+
+# Fixture class: exposes one instance method.
+class HelloWorld
+  # Called on an instance (HelloWorld.new.say_hello); prints a fixed greeting via puts.
+  def say_hello
+    puts 'Hello, World!'
+  end
+end
+
+# A class instance variable is not shared by the class's descendants.
+class Human
+  # Assigned in the class body, so @bar belongs to the Human class object
+  # itself (a class instance variable), not to instances of Human.
+  @bar = 0
+
+  # Class-level reader: returns @bar of whichever class receives the call.
+  def self.bar
+    @bar
+  end
+
+  # Class-level writer: sets @bar on whichever class receives the call.
+  def self.bar=(value)
+    @bar = value
+  end
+end
+
+# Empty subclass of Human. It inherits the class-level bar / bar= methods,
+# but the class body's `@bar = 0` ran for Human only, so Doctor.bar is nil
+# until Doctor.bar= is called.
+class Doctor < Human
+end
diff --git a/tests/more_languages/group_todo/ruby_test.rb b/tests/more_languages/group_todo/ruby_test.rb
diff --git a/tests/test_more_language_units.py b/tests/test_more_language_units.py
@@ -230,7 +230,7 @@ def test_more_languages_group2(
 	param2 int,
 	param3 map[string]interface{},
 	callback func(int) error,
-) (resultType, error)""", 
+) (resultType, error)""",
                 "type resultType struct",
                 "func main()",
             ],
@@ -255,7 +255,7 @@ def test_more_languages_group2(
     age: Int, 
     address: String, 
     phoneNumber: String
-)"""
+)""",
             ],
         ),
         (
@@ -267,6 +267,19 @@ def test_more_languages_group2(
                 "alias md='make debug'",
             ],
         ),
+        (
+            "tests/more_languages/group3/ruby_test.rb",
+            [
+                "module Greeter",
+                "  def self.say_hello",
+                "class HelloWorld",
+                "  def say_hello",
+                "class Human",
+                "  def self.bar",
+                "  def self.bar=(value)",
+                "class Doctor < Human",
+            ],
+        ),
     ],
 )
 def test_more_languages_group3(
@@ -662,11 +675,6 @@ def test_more_languages_group5(
 #     ["defprotocol P", "defrecord Person", "defn -main"],
 # ),
 # (
-#     "tests/more_languages/group3/cpp_test.cpp",
-#     ["class Person", "void Person::greet", "void globalGreet", "int main"],
-# ),
-# (
-# (
 #     "tests/more_languages/group3/fortran_test.f90",
 #     [
 #         "MODULE hello_mod -> TYPE person",
@@ -703,13 +711,7 @@ def test_more_languages_group5(
 #     "tests/more_languages/group4/matlab_test.m",
 #     ["classdef HelloWorld -> function greet", "function loneFun"],
 # ),
-# (
-#     "tests/more_languages/group4/ruby_test.rb",
-#     [
-#         "module Greeter -> def self.say_hello",
-#         "class HelloWorld -> def say_hello",
-#     ],
-# ),
+
 # ( # unclear how to / what to include here, might be good to skip
 #     "tests/more_languages/group5/nodemon.json",
 #     [

diff --git a/tree_plus_src/parse_file.py b/tree_plus_src/parse_file.py
@@ -83,6 +83,10 @@ def parse_file(file_path: str) -> List[str]:
         components = parse_c(contents)
     elif file_extension in {".cpp", ".cc"}:
         components = parse_cpp(contents)
+    elif file_extension == ".h":
+        # NOTE: ".h" is ambiguous (it may be a C or a C++ header); default to the C parser.
+        components = parse_c(contents)
+        # components = parse_cpp(contents)
     elif file_extension == ".rs":
         components = parse_rs(contents)
     elif file_extension == ".swift":
@@ -91,6 +95,8 @@ def parse_file(file_path: str) -> List[str]:
         components = parse_go(contents)
     elif file_extension == ".sh":
         components = parse_bash(contents)
+    elif file_extension == ".rb":
+        components = parse_rb(contents)
     elif file_extension == ".env":
         components = parse_dot_env(contents)
     elif file_extension == ".sql":
@@ -143,8 +149,50 @@ def parse_file(file_path: str) -> List[str]:
     return total_components
 
 
+def parse_rb(contents) -> List[str]:
+    """Extract class, module and method signatures from Ruby source text.
+
+    Args:
+        contents: The full text of a Ruby file.
+
+    Returns:
+        The matched signatures in source order. Class and module
+        signatures must start at column 0 and are returned as written
+        (e.g. ``"class Doctor < Human"``). Method signatures keep their
+        leading indentation (e.g. ``"  def self.bar=(value)"``) and include
+        a parenthesised parameter list when one follows the name on the
+        same line.
+
+    Notes:
+        Matching is regex-based, not a real parse: operator methods
+        (``def ==``), ``class << self`` and indented (nested) classes are
+        not reported.
+    """
+    debug_print("parse_rb")
+
+    # A constant, optionally namespaced: Foo or Foo::Bar::Baz
+    constant_path = r"\w+(?:::\w+)*"
+
+    combined_pattern = re.compile(
+        # "^" with re.MULTILINE (rather than a leading "\n") lets a
+        # definition on the very first line of the file match as well.
+        r"^(?:"
+        # Class definitions, with an optional superclass
+        r"class\s+" + constant_path + r"(?:\s*<\s*" + constant_path + r")?"
+        # Module definitions
+        r"|module\s+" + constant_path +
+        # Method definitions (instance and class methods). The name may end
+        # in "?", "!" or "=". Only spaces/tabs are allowed before "def" and
+        # before "(" so the match cannot drift onto another line.
+        r"|[ \t]*def\s+(?:self\.)?\w+[?!=]?(?:[ \t]*\([^)]*\))?"
+        r")",
+        re.MULTILINE,
+    )
+
+    # rstrip only: leading indentation of method signatures is significant.
+    return [match.group().rstrip() for match in combined_pattern.finditer(contents)]
+
+
+def remove_c_comments(multiline_string):
+    """Strip C/C++ comments from source text without touching literals.
+
+    Block comments (``/* ... */``) are removed first, then line comments
+    (``// ...``) together with the whitespace that precedes them. Comment
+    markers that appear inside a single-line string or character literal
+    (e.g. ``"http://example.com"``) are left alone, and a ``/*`` that
+    appears inside a ``//`` comment does not open a block comment.
+
+    Args:
+        multiline_string: C or C++ source text.
+
+    Returns:
+        The text with comments removed. Removed comments are replaced by
+        nothing (not by a space), and the newline that ends a line comment
+        is kept.
+
+    Notes:
+        C++ raw string literals and literals that span lines are not
+        recognised as literals.
+    """
+    # A double- or single-quoted literal on one line, honouring backslash escapes.
+    literal = r'"(?:\\.|[^"\\\n])*"|\'(?:\\.|[^\'\\\n])*\''
+
+    def keep_protected(match):
+        # Group 1 holds text that must survive (literals, and in the first
+        # pass line comments); it is unset when a comment to drop matched.
+        return match.group(1) or ""
+
+    # Removing block comments. Literals and line comments are matched first
+    # and re-emitted, so a "/*" inside either is not treated as an opener.
+    no_block_comments = re.sub(
+        "(" + literal + r"|//[^\n]*)|/\*.*?\*/",
+        keep_protected,
+        multiline_string,
+        flags=re.DOTALL,
+    )
+
+    # Removing line comments and preceding whitespace; literals are re-emitted.
+    cleaned_string = re.sub(
+        "(" + literal + r")|\s*//[^\n]*",
+        keep_protected,
+        no_block_comments,
+        flags=re.DOTALL,
+    )
+
+    return cleaned_string
+
+
 def parse_cpp(contents) -> List[str]:
     debug_print("parse_cpp")
+    contents = remove_c_comments(contents)
 
     # Combined regex pattern to match all components
     combined_pattern = re.compile(
@@ -167,6 +215,76 @@ def parse_cpp(contents) -> List[str]:
     return components
 
 
+def parse_c(contents) -> List[str]:
+    """Extract function, struct, enum and typedef-struct signatures from C source.
+
+    Comments are stripped with ``remove_c_comments`` before matching. Every
+    pattern is anchored on a preceding newline, so only definitions that
+    start at column 0 (and not on the very first line) are reported.
+
+    Args:
+        contents: The full text of a C source or header file.
+
+    Returns:
+        Signatures in source order. Functions, structs and enums are
+        reported as the text before their opening ``{``; anonymous
+        ``typedef struct { ... } name;`` is reported as
+        ``"typedef struct name"``.
+    """
+    debug_print("parse_c")
+    contents = remove_c_comments(contents)
+
+    combined_pattern = re.compile(
+        # Functions (including pointer return types). The head is written as
+        # whitespace-separated tokens so there is exactly one way to split
+        # it; the previous "(?:[\w*]+\s*)+" form backtracked exponentially
+        # on prototypes and global declarations. The lookbehind requires the
+        # function name to end in a word character (not "*").
+        r"\n[\w*]+(?:\s+[\w*]+)*(?<=\w)\s*\([^)]*\)\s*\{[^}]*\}|"
+        # Structs
+        r"\nstruct\s+\w+\s*\{[^}]*\}|"
+        # Enums
+        r"\nenum\s+\w+\s*\{[^}]*\}|"
+        # Typedefs (anonymous struct); the alias is captured by name so the
+        # kind of match is known without inspecting the matched text.
+        r"\ntypedef\s+struct\s*\{[^}]*\}\s*(?P<typedef_name>\w+);",
+    )
+
+    components = []
+
+    for match in combined_pattern.finditer(contents):
+        typedef_name = match.group("typedef_name")
+        if typedef_name is not None:
+            # Report only the typedef struct name
+            component = f"typedef struct {typedef_name}"
+        else:
+            # Keep only the signature, i.e. everything before the body
+            component = match.group().split("{")[0].strip()
+
+        components.append(component)
+
+    return components
+
+
+# def parse_c(content: str) -> List[str]:
+#     # Define the regular expressions for function, struct, enum, and typedef
+#     # regex_function = r"((?:\w+\s+)+\*?\s*\w+\s*\([^)]*\)\s*\{[^}]*\})"
+#     regex_function = r"((?:[\w*]+\s*)+\*?\s*\w+\s*\([^)]*\)\s*\{[^}]*\})"
+#     regex_struct = r"(struct\s+\w+\s*\{[^}]*\})"
+#     regex_enum = r"(enum\s+\w+\s*\{[^}]*\})"
+#     regex_typedef = r"(typedef\s+struct\s*\{[^}]*\}\s*\w+;)"
+
+#     # Combine all regexes into a single one, each separated by '|'
+#     regex = f"{regex_function}|{regex_struct}|{regex_enum}|{regex_typedef}"
+
+#     # Find all matches
+#     matches = re.findall(regex, content, re.DOTALL)
+
+#     # Initialize the list to hold the parsed elements
+#     parsed = []
+
+#     # Iterate through the matches
+#     for match in matches:
+#         # Only one of the groups will be non-empty
+#         # Iterate through the match groups
+#         for group in match:
+#             # Append the non-empty match to the list
+#             if group:
+#                 # Check if the group is a typedef
+#                 if "typedef" in group:
+#                     # Extract only the typedef struct name
+#                     typedef_name = group.split("}")[1].split(";")[0].strip()
+#                     parsed.append(f"typedef struct {typedef_name}")
+#                 else:
+#                     # Extract only the first line for each component
+#                     parsed.append(group.split("{")[0].strip())
+
+#     return parsed
+
+
 def parse_go(contents) -> List[str]:
     debug_print("parse_go")
 
@@ -195,6 +313,7 @@ def parse_go(contents) -> List[str]:
 
     return components
 
+
 def parse_swift(contents) -> List[str]:
     debug_print("parse_swift")
 
@@ -221,6 +340,7 @@ def parse_swift(contents) -> List[str]:
 
     return components
 
+
 def parse_bash(contents) -> List[str]:
     debug_print("parse_bash")
 
@@ -1084,42 +1204,6 @@ def parse_scala(contents: str) -> List[str]:
     return result
 
 
-def parse_c(content: str) -> List[str]:
-    # Define the regular expressions for function, struct, enum, and typedef
-    # regex_function = r"((?:\w+\s+)+\*?\s*\w+\s*\([^)]*\)\s*\{[^}]*\})"
-    regex_function = r"((?:[\w*]+\s*)+\*?\s*\w+\s*\([^)]*\)\s*\{[^}]*\})"
-    regex_struct = r"(struct\s+\w+\s*\{[^}]*\})"
-    regex_enum = r"(enum\s+\w+\s*\{[^}]*\})"
-    regex_typedef = r"(typedef\s+struct\s*\{[^}]*\}\s*\w+;)"
-
-    # Combine all regexes into a single one, each separated by '|'
-    regex = f"{regex_function}|{regex_struct}|{regex_enum}|{regex_typedef}"
-
-    # Find all matches
-    matches = re.findall(regex, content, re.DOTALL)
-
-    # Initialize the list to hold the parsed elements
-    parsed = []
-
-    # Iterate through the matches
-    for match in matches:
-        # Only one of the groups will be non-empty
-        # Iterate through the match groups
-        for group in match:
-            # Append the non-empty match to the list
-            if group:
-                # Check if the group is a typedef
-                if "typedef" in group:
-                    # Extract only the typedef struct name
-                    typedef_name = group.split("}")[1].split(";")[0].strip()
-                    parsed.append(f"typedef struct {typedef_name}")
-                else:
-                    # Extract only the first line for each component
-                    parsed.append(group.split("{")[0].strip())
-
-    return parsed
-
-
 def parse_tf(contents: str) -> List[str]:
     pattern = r'(provider|resource|data|variable|output|locals|module)\s+("[^"]*"\s*"[^"]*"|"[^"]*"|\'[^\']*\'|[^\s]*)\s*[{"{]'
     matches = re.findall(pattern, contents, re.MULTILINE)