Skip to content

Commit

Permalink
perf: c; fix: cpp comments; feat: rb
Browse files Browse the repository at this point in the history
  • Loading branch information
bionicles committed Dec 22, 2023
1 parent 8c6434e commit 8cc3732
Show file tree
Hide file tree
Showing 5 changed files with 165 additions and 64 deletions.
2 changes: 1 addition & 1 deletion tests/more_languages/group3/cpp_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ class Cat : public Animal {
nb::bytes BuildRnnDescriptor(int input_size, int hidden_size, int num_layers,
int batch_size, int max_seq_length, float dropout,
bool bidirectional, bool cudnn_allow_tf32,
int workspace_size, int reserve_space_size) {
int workspace_size, int reserve_space_size/* = {} */) {
return PackDescriptor(RnnDescriptor{
input_size, hidden_size, num_layers, batch_size, max_seq_length, dropout,
bidirectional, cudnn_allow_tf32, workspace_size, reserve_space_size
Expand Down
28 changes: 28 additions & 0 deletions tests/more_languages/group3/ruby_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# ruby_test.rb
# Module used as a namespace for a single module-level greeting helper.
module Greeter
# Module method (defined on Greeter itself via `self.`), callable as
# Greeter.say_hello; prints a fixed greeting with `puts`.
def self.say_hello
puts 'Hello from the Greeter module!'
end
end

# Minimal class with one instance method.
class HelloWorld
# Instance method; prints a fixed greeting with `puts`.
def say_hello
puts 'Hello, World!'
end
end

# A class instance variable is not shared by the class's descendants.
# Human keeps its counter in @bar, an instance variable of the Human class
# object itself (not of Human instances), and exposes it through a
# class-level reader/writer pair.
class Human
# Assigned while the class body runs, so it belongs to the Human class object.
@bar = 0

# Class-level reader: returns the receiver's own @bar.
def self.bar
@bar
end

# Class-level writer: stores value in the receiver's own @bar.
def self.bar=(value)
@bar = value
end
end

# Subclass of Human with an empty body; it adds no methods or state of its own.
class Doctor < Human
end
13 changes: 0 additions & 13 deletions tests/more_languages/group_todo/ruby_test.rb

This file was deleted.

30 changes: 16 additions & 14 deletions tests/test_more_language_units.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def test_more_languages_group2(
param2 int,
param3 map[string]interface{},
callback func(int) error,
) (resultType, error)""",
) (resultType, error)""",
"type resultType struct",
"func main()",
],
Expand All @@ -255,7 +255,7 @@ def test_more_languages_group2(
age: Int,
address: String,
phoneNumber: String
)"""
)""",
],
),
(
Expand All @@ -267,6 +267,19 @@ def test_more_languages_group2(
"alias md='make debug'",
],
),
(
"tests/more_languages/group3/ruby_test.rb",
[
"module Greeter",
" def self.say_hello",
"class HelloWorld",
" def say_hello",
"class Human",
" def self.bar",
" def self.bar=(value)",
"class Doctor < Human",
],
),
],
)
def test_more_languages_group3(
Expand Down Expand Up @@ -662,11 +675,6 @@ def test_more_languages_group5(
# ["defprotocol P", "defrecord Person", "defn -main"],
# ),
# (
# "tests/more_languages/group3/cpp_test.cpp",
# ["class Person", "void Person::greet", "void globalGreet", "int main"],
# ),
# (
# (
# "tests/more_languages/group3/fortran_test.f90",
# [
# "MODULE hello_mod -> TYPE person",
Expand Down Expand Up @@ -703,13 +711,7 @@ def test_more_languages_group5(
# "tests/more_languages/group4/matlab_test.m",
# ["classdef HelloWorld -> function greet", "function loneFun"],
# ),
# (
# "tests/more_languages/group4/ruby_test.rb",
# [
# "module Greeter -> def self.say_hello",
# "class HelloWorld -> def say_hello",
# ],
# ),

# ( # unclear how to / what to include here, might be good to skip
# "tests/more_languages/group5/nodemon.json",
# [
Expand Down
156 changes: 120 additions & 36 deletions tree_plus_src/parse_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ def parse_file(file_path: str) -> List[str]:
components = parse_c(contents)
elif file_extension in {".cpp", ".cc"}:
components = parse_cpp(contents)
elif file_extension == ".h":
# harrumph!
components = parse_c(contents)
# components = parse_cpp(contents)
elif file_extension == ".rs":
components = parse_rs(contents)
elif file_extension == ".swift":
Expand All @@ -91,6 +95,8 @@ def parse_file(file_path: str) -> List[str]:
components = parse_go(contents)
elif file_extension == ".sh":
components = parse_bash(contents)
elif file_extension == ".rb":
components = parse_rb(contents)
elif file_extension == ".env":
components = parse_dot_env(contents)
elif file_extension == ".sql":
Expand Down Expand Up @@ -143,8 +149,50 @@ def parse_file(file_path: str) -> List[str]:
return total_components


def parse_rb(contents) -> List[str]:
    """Extract class, module and method signatures from Ruby source text.

    Each component is the matched signature line(s) with leading indentation
    preserved, so nesting stays visible in the output. Top-level ``class`` and
    ``module`` headers therefore appear without indentation.

    Args:
        contents: Full text of a Ruby source file.

    Returns:
        Signatures in source order, e.g.
        ``["module Greeter", "  def self.say_hello", "class Doctor < Human"]``.
    """
    debug_print("parse_rb")

    # ``^`` with re.MULTILINE anchors at every line start, including the very
    # first line of the file (a literal leading "\n" would miss it).
    # Indentation is limited to spaces/tabs so a whitespace-only line above a
    # definition can never leak into the captured component.
    signature_pattern = re.compile(
        # class / module header: optionally indented, optionally namespaced
        # (A::B) and, for classes, optionally followed by "< Superclass".
        r"^[ \t]*(?:class|module)[ \t]+\w+(?:::\w+)*"
        r"(?:[ \t]*<[ \t]*\w+(?:::\w+)*)?"
        # method header: instance or ``self.`` method. Ruby method names may
        # end in ``?``, ``!`` or ``=``. The parameter list is optional and may
        # span several lines.
        r"|^[ \t]*def[ \t]+(?:self\.)?\w+[?!=]?(?:[ \t]*\([^)]*\))?",
        re.MULTILINE,
    )

    return [match.group() for match in signature_pattern.finditer(contents)]


def remove_c_comments(multiline_string):
    """Strip C/C++ comments from source text, leaving literals untouched.

    The text is scanned once, left to right. String and character literals
    are recognised as tokens and copied through verbatim, so comment markers
    inside them (``"http://host"``, ``"/* not a comment */"``) survive.
    Because comments are tokenised in the same pass, a ``/*`` that appears
    inside a ``//`` comment cannot start a block comment.

    Args:
        multiline_string: C or C++ source text.

    Returns:
        The text with every ``/* ... */`` block comment removed (replaced by
        nothing) and every ``// ...`` comment removed together with the
        whitespace that precedes it.
    """
    token_pattern = re.compile(
        # double-quoted string literal (escapes allowed, single line)
        r'"(?:\\.|[^"\\\n])*"'
        # single-quoted character literal
        r"|'(?:\\.|[^'\\\n])*'"
        # block comment, may span lines
        r"|(?P<block>/\*.*?\*/)"
        # line comment up to (not including) the newline
        r"|(?P<line>//[^\n]*)",
        re.DOTALL,
    )

    kept = []
    cursor = 0
    for match in token_pattern.finditer(multiline_string):
        kept.append(multiline_string[cursor : match.start()])
        cursor = match.end()

        kind = match.lastgroup
        if kind is None:
            # A literal: keep it exactly as written.
            kept.append(match.group())
        elif kind == "line":
            # A line comment also swallows the whitespace in front of it,
            # even when a removed block comment sat in between.
            while kept:
                tail = kept.pop().rstrip()
                if tail:
                    kept.append(tail)
                    break
        # Block comments are simply dropped.

    kept.append(multiline_string[cursor:])
    return "".join(kept)


def parse_cpp(contents) -> List[str]:
debug_print("parse_cpp")
contents = remove_c_comments(contents)

# Combined regex pattern to match all components
combined_pattern = re.compile(
Expand All @@ -167,6 +215,76 @@ def parse_cpp(contents) -> List[str]:
return components


def parse_c(contents) -> List[str]:
    """Extract top-level function, struct, enum and typedef-struct headers.

    Comments are stripped first. Only definitions that start in column 0 are
    considered, which keeps indented control statements such as
    ``if (...) {`` from being reported as functions.

    Args:
        contents: C source (or header) text.

    Returns:
        Components in source order. Functions, structs and enums are reduced
        to the text before their opening brace (``"int main()"``,
        ``"struct Point"``). An anonymous ``typedef struct { ... } Name;`` is
        reported as ``"typedef struct Name"``.
    """
    debug_print("parse_c")
    contents = remove_c_comments(contents)

    # ``^`` with re.MULTILINE anchors at every line start, including the first
    # line of the file, which a literal leading "\n" would miss.
    combined_pattern = re.compile(
        # Functions (including pointer return types). The header prefix is
        # written as "one [\w*] char, any run of [\w*\s], one \w char". This
        # accepts the same text as ``(?:[\w*]+\s*)+\*?\s*\w+`` but has no
        # nested quantifier, so it cannot backtrack exponentially on a long
        # identifier that is not followed by "(".
        r"^([\w*][\w*\s]*\w\s*\([^)]*\)\s*\{[^}]*\})|"
        # Structs
        r"^struct\s+\w+\s*\{[^}]*\}|"
        # Enums
        r"^enum\s+\w+\s*\{[^}]*\}|"
        # Typedefs
        r"^typedef\s+struct\s*\{[^}]*\}\s*\w+;",
        re.DOTALL | re.MULTILINE,
    )

    components = []

    for match in combined_pattern.finditer(contents):
        component = match.group().strip()
        if component.startswith("typedef"):
            # Keep only the alias that follows the closing brace.
            typedef_name = component.split("}")[1].split(";")[0].strip()
            component = f"typedef struct {typedef_name}"
        else:
            # Keep only the header, i.e. everything before the body.
            component = component.split("{")[0].strip()

        components.append(component)

    return components


# def parse_c(content: str) -> List[str]:
# # Define the regular expressions for function, struct, enum, and typedef
# # regex_function = r"((?:\w+\s+)+\*?\s*\w+\s*\([^)]*\)\s*\{[^}]*\})"
# regex_function = r"((?:[\w*]+\s*)+\*?\s*\w+\s*\([^)]*\)\s*\{[^}]*\})"
# regex_struct = r"(struct\s+\w+\s*\{[^}]*\})"
# regex_enum = r"(enum\s+\w+\s*\{[^}]*\})"
# regex_typedef = r"(typedef\s+struct\s*\{[^}]*\}\s*\w+;)"

# # Combine all regexes into a single one, each separated by '|'
# regex = f"{regex_function}|{regex_struct}|{regex_enum}|{regex_typedef}"

# # Find all matches
# matches = re.findall(regex, content, re.DOTALL)

# # Initialize the list to hold the parsed elements
# parsed = []

# # Iterate through the matches
# for match in matches:
# # Only one of the groups will be non-empty
# # Iterate through the match groups
# for group in match:
# # Append the non-empty match to the list
# if group:
# # Check if the group is a typedef
# if "typedef" in group:
# # Extract only the typedef struct name
# typedef_name = group.split("}")[1].split(";")[0].strip()
# parsed.append(f"typedef struct {typedef_name}")
# else:
# # Extract only the first line for each component
# parsed.append(group.split("{")[0].strip())

# return parsed


def parse_go(contents) -> List[str]:
debug_print("parse_go")

Expand Down Expand Up @@ -195,6 +313,7 @@ def parse_go(contents) -> List[str]:

return components


def parse_swift(contents) -> List[str]:
debug_print("parse_swift")

Expand All @@ -221,6 +340,7 @@ def parse_swift(contents) -> List[str]:

return components


def parse_bash(contents) -> List[str]:
debug_print("parse_bash")

Expand Down Expand Up @@ -1084,42 +1204,6 @@ def parse_scala(contents: str) -> List[str]:
return result


def parse_c(content: str) -> List[str]:
    """Extract top-level function, struct, enum and typedef-struct headers.

    Only definitions that start in column 0 are considered, so indented
    control statements inside a function body (``while (...) {``) are not
    reported as functions.

    Args:
        content: C source text.

    Returns:
        Components in source order. Functions, structs and enums are reduced
        to the text before their opening brace (``"int main()"``,
        ``"struct Point"``). An anonymous ``typedef struct { ... } Name;`` is
        reported as ``"typedef struct Name"``.
    """
    # The function-header prefix is "one [\w*] char, any run of [\w*\s], one
    # \w char". It accepts the same text as ``(?:[\w*]+\s*)+\*?\s*\w+`` but
    # has no nested quantifier, so a long identifier that is not followed by
    # "(" can no longer trigger exponential backtracking.
    regex_function = r"^[\w*][\w*\s]*\w\s*\([^)]*\)\s*\{[^}]*\}"
    regex_struct = r"^struct\s+\w+\s*\{[^}]*\}"
    regex_enum = r"^enum\s+\w+\s*\{[^}]*\}"
    regex_typedef = r"^typedef\s+struct\s*\{[^}]*\}\s*\w+;"

    # Alternatives are tried in this order at each line start.
    regex = "|".join((regex_function, regex_struct, regex_enum, regex_typedef))

    parsed = []
    for match in re.finditer(regex, content, re.DOTALL | re.MULTILINE):
        definition = match.group()
        # Check the prefix rather than containment: a function body may
        # mention "typedef" without being a typedef definition.
        if definition.startswith("typedef"):
            # Keep only the alias that follows the closing brace.
            typedef_name = definition.split("}")[1].split(";")[0].strip()
            parsed.append(f"typedef struct {typedef_name}")
        else:
            # Keep only the header, i.e. everything before the body.
            parsed.append(definition.split("{")[0].strip())

    return parsed


def parse_tf(contents: str) -> List[str]:
pattern = r'(provider|resource|data|variable|output|locals|module)\s+("[^"]*"\s*"[^"]*"|"[^"]*"|\'[^\']*\'|[^\s]*)\s*[{"{]'
matches = re.findall(pattern, contents, re.MULTILINE)
Expand Down

0 comments on commit 8cc3732

Please sign in to comment.