# Originally content/papers/change_id.python (the surrounding diff deletes this file).
import os
import argparse


def replace_authors_line(file_path, search_string, replace_string):
    """Replace a string inside the bulleted ``authors`` list of YAML frontmatter.

    Only bullet lines (``- ...``) that directly follow the first ``authors:``
    key are touched; every other frontmatter line and the document body are
    left alone. The file is rewritten only when at least one bullet changed.

    Args:
        file_path: Path of the file to edit in place.
        search_string: Text to look for in each author bullet. An empty
            string is ignored.
        replace_string: Text substituted for every occurrence of
            ``search_string`` within a matching bullet.

    Returns:
        None. Files that cannot be decoded as UTF-8, do not start with a
        ``---`` line, or have no closing ``---`` line are skipped.
    """
    if not search_string:
        # str.replace("", x) would insert x between every character.
        return

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
    except UnicodeDecodeError:
        print(f"Skipping {file_path}: Unable to decode file")
        return

    if not lines or lines[0].strip() != "---":
        return  # Skip files without YAML frontmatter

    # Index of the closing `---`; None when the frontmatter is never closed.
    yaml_end_idx = next(
        (i for i in range(1, len(lines)) if lines[i].strip() == "---"),
        None,
    )
    if yaml_end_idx is None:
        return

    authors_start_idx = next(
        (
            i
            for i in range(1, yaml_end_idx)
            if lines[i].strip().startswith("authors:")
        ),
        None,
    )
    if authors_start_idx is None:
        return

    modified = False
    for i in range(authors_start_idx + 1, yaml_end_idx):
        raw = lines[i]
        item = raw.strip()
        if not item.startswith("-"):
            # The list is assumed to end at the first non-bullet line.
            break
        if search_string not in item:
            continue
        # Keep the line's own indentation: forcing a fixed indent could
        # change how the bullet nests inside the YAML document.
        indent = raw[: len(raw) - len(raw.lstrip())]
        updated = indent + item.replace(search_string, replace_string) + "\n"
        if updated != raw:
            lines[i] = updated
            modified = True

    if modified:
        with open(file_path, "w", encoding="utf-8") as f:
            f.writelines(lines)
        print(f"Updated authors in {file_path}")


def process_directory(directory, search_string, replace_string):
    """Apply ``replace_authors_line`` to every ``.md`` file under a directory.

    Args:
        directory: Root directory; it is walked recursively.
        search_string: Text to look for in author bullets.
        replace_string: Text to substitute for ``search_string``.
    """
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".md"):  # Only process .md files
                file_path = os.path.join(root, file)
                replace_authors_line(file_path, search_string, replace_string)
# Entry point of content/papers/change_id.python: relies on the `argparse`
# import and the `process_directory` function defined earlier in that script.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Replace a string in the authors bulleted list of YAML frontmatter.")
    parser.add_argument("directory", help="Directory containing the files to process")
    parser.add_argument("search_string", help="String to search for in the authors field")
    parser.add_argument("replace_string", help="String to replace the search string with")
    args = parser.parse_args()

    process_directory(args.directory, args.search_string, args.replace_string)


# Originally content/papers/find_duplicate_titles.py (the surrounding diff
# deletes this file as well).
import os
import yaml
from fuzzywuzzy import fuzz


def extract_title_from_frontmatter(file_path):
    """Return the ``title`` value from a markdown file's YAML frontmatter.

    Args:
        file_path: Path of the markdown file to read.

    Returns:
        The value stored under ``title`` (whatever type YAML parsed it as),
        or None when the file cannot be decoded as UTF-8, has no opening or
        closing ``---`` line, fails to parse as YAML, does not parse to a
        mapping, or has no ``title`` key.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
    except UnicodeDecodeError:
        print(f"Skipping {file_path}: Unable to decode file")
        return None

    if not lines or lines[0].strip() != "---":
        return None  # Skip files without YAML frontmatter

    # Index of the closing `---`; None when the frontmatter is never closed.
    yaml_end_idx = next(
        (i for i in range(1, len(lines)) if lines[i].strip() == "---"),
        None,
    )
    if yaml_end_idx is None:
        return None

    try:
        frontmatter = yaml.safe_load("".join(lines[1:yaml_end_idx]))
    except yaml.YAMLError:
        return None  # Skip if YAML parsing fails

    # safe_load yields None for empty frontmatter and may yield a list or a
    # scalar; only a mapping can carry a `title` key.
    if not isinstance(frontmatter, dict):
        return None
    return frontmatter.get("title", None)


def find_similar_titles(directory, similarity_threshold=80):
    """Print markdown files under ``directory`` whose titles repeat or nearly repeat.

    Every ``.md`` file is visited recursively. A title equal to one already
    seen is reported as an exact duplicate; otherwise it is compared
    case-insensitively against every earlier title with ``fuzz.ratio`` and
    reported for each one scoring at or above the threshold.

    Args:
        directory: Root directory to scan.
        similarity_threshold: Minimum ``fuzz.ratio`` score for two titles
            to be reported as similar.

    Returns:
        None. Findings are written to standard output.
    """
    titles = {}  # title -> path of the most recent file carrying it
    duplicates = []

    for root, dirs, files in os.walk(directory):
        # Fix the traversal order so the report does not depend on the
        # order in which the filesystem happens to list entries.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".md"):  # Only process .md files
                continue
            file_path = os.path.join(root, file)
            title = extract_title_from_frontmatter(file_path)
            if not title:
                continue
            # YAML may parse a title as a number, date or list; compare
            # everything as text so `.lower()` and dict lookups are safe.
            title = str(title)

            if title in titles:
                duplicates.append((file_path, title))
            else:
                lowered = title.lower()
                for seen_title, seen_file in titles.items():
                    similarity = fuzz.ratio(lowered, seen_title.lower())
                    if similarity >= similarity_threshold:
                        duplicates.append((file_path, title, seen_file, seen_title, similarity))

            titles[title] = file_path

    if duplicates:
        print("Files with duplicate or similar titles:")
        for entry in duplicates:
            # 2-tuples are exact duplicates, 5-tuples are fuzzy matches.
            if len(entry) == 2:
                print(f"Exact duplicate: Title: '{entry[1]}' found in {entry[0]}")
            elif len(entry) == 5:
                print(f"Similar title: '{entry[1]}' in {entry[0]} and '{entry[3]}' in {entry[2]} (Similarity: {entry[4]}%)")
                print("\n")
    else:
        print("No duplicate or similar titles found.")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Find and print Markdown files with duplicate or similar titles.")
    parser.add_argument("directory", help="Directory to scan for markdown files with duplicate or similar titles")
    # argparse %-formats help strings, so a literal percent sign must be
    # written as `%%`; a bare `%` makes rendering the help raise ValueError.
    parser.add_argument("--threshold", type=int, default=80, help="Threshold for similarity percentage (default: 80%%)")
    args = parser.parse_args()

    find_similar_titles(args.directory, args.threshold)