From 341c74ca4e2adfb733a850605c590bf9f3797186 Mon Sep 17 00:00:00 2001 From: b-sai Date: Sun, 12 Feb 2023 11:35:06 -0600 Subject: [PATCH 1/5] initial implementation complete --- main.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 main.py diff --git a/main.py b/main.py new file mode 100644 index 00000000..88789d3d --- /dev/null +++ b/main.py @@ -0,0 +1,40 @@ +with open("F18/data/list.txt", "r") as f: + languages = f.read().splitlines() +print(languages) + + +def to_lowercase(word: str, language: str): + """ + word: str, the string to be converted to lowercase + language: str, the language of the string, in BCP 47 format + """ + result = "" + + for idx, letter in enumerate(word): + lower_letter = letter.lower() + if language == 'tr' or language == 'az': + if letter == 'I': + lower_letter = 'ı' + elif language.startswith(('gd', 'gv', 'ga')): + if idx == 1 and letter in ['A', 'E', 'I', 'O', 'U', 'Á', 'É', 'Í', 'Ó', 'Ú'] and word[0] in ['n', 't']: + lower_letter = "-"+letter.lower() + elif language.startswith('el'): + if letter == 'Σ' and idx == len(word)-1: + lower_letter = 'ς' + elif language.startswith(("zh", "th", "ja")): + lower_letter = letter + + result += lower_letter + + return result + + +with open("S23/b-sai/tests.tsv", "r", encoding="utf-8") as f: + tests = f.read().splitlines() +for test in tests: + word, language, expected = test.split("\t") + predicted = to_lowercase(word, language) + if predicted != expected: + print(f"Failed to convert {word} to lowercase in {language}!") + print(f"Expected: {expected}") + print(f"Actual: {predicted}") From 84c113a53ff10eec0790a397e79dc465ee5bb572 Mon Sep 17 00:00:00 2001 From: b-sai Date: Sun, 12 Feb 2023 11:43:58 -0600 Subject: [PATCH 2/5] moved file to right repo --- main.py => S23/b-sai/main.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename main.py => S23/b-sai/main.py (100%) diff --git a/main.py b/S23/b-sai/main.py similarity index 100% rename from main.py rename to S23/b-sai/main.py From 3aefce88ee6ded0d10f37b4827ead3a7732ca3ab Mon Sep 17 00:00:00 2001 From: b-sai Date: Sun, 12 Feb 2023 12:34:39 -0600 Subject: [PATCH 3/5] bug fix --- S23/b-sai/main.py | 19 +++++++++++-------- S23/b-sai/tests.tsv | 5 +++++ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/S23/b-sai/main.py b/S23/b-sai/main.py index 88789d3d..8e79b250 100644 --- a/S23/b-sai/main.py +++ b/S23/b-sai/main.py @@ -1,7 +1,3 @@ -with open("F18/data/list.txt", "r") as f: - languages = f.read().splitlines() -print(languages) - def to_lowercase(word: str, language: str): """ @@ -11,12 +7,13 @@ def to_lowercase(word: str, language: str): result = "" for idx, letter in enumerate(word): + lower_letter = letter.lower() if language == 'tr' or language == 'az': if letter == 'I': lower_letter = 'ı' elif language.startswith(('gd', 'gv', 'ga')): - if idx == 1 and letter in ['A', 'E', 'I', 'O', 'U', 'Á', 'É', 'Í', 'Ó', 'Ú'] and word[0] in ['n', 't']: + if idx == 1 and (letter in ['A', 'E', 'I', 'O', 'U', 'Á', 'É', 'Í', 'Ó', 'Ú', "Ó"] or ord(letter) in [211]) and word[0] in ['n', 't'] and (len(word)-idx >= 2 and ord(word[idx+1]) != 771): lower_letter = "-"+letter.lower() elif language.startswith('el'): if letter == 'Σ' and idx == len(word)-1: @@ -29,12 +26,18 @@ def to_lowercase(word: str, language: str): return result -with open("S23/b-sai/tests.tsv", "r", encoding="utf-8") as f: +with open("tests.tsv", "r", encoding="utf-8") as f: tests = f.read().splitlines() + +num_correct = 0 for test in tests: word, language, expected = test.split("\t") predicted = to_lowercase(word, language) if predicted != expected: print(f"Failed to convert {word} to lowercase in {language}!") - print(f"Expected: {expected}") - print(f"Actual: {predicted}") + print(f"Actual: {expected}") + print(f"Predicted: {predicted}") + else: + num_correct += 1 + +print(f"Passed {num_correct} out of {len(tests)} tests.") diff --git a/S23/b-sai/tests.tsv b/S23/b-sai/tests.tsv index bb7a831e..9c6d2da9 100644 --- a/S23/b-sai/tests.tsv +++ b/S23/b-sai/tests.tsv @@ -16,3 +16,8 @@ KASIM en kasim ΠΌΛΗΣ el πόλης 官话 zh-Hans 官话 ภาษาไทย th ภาษาไทย +車 ja 車 +うさぎ ja うさぎ +ลา th ลา +ลิง th ลิง +ΚΑΘΙΣΤΕ el καθιστε \ No newline at end of file From 702f028407243ba38d4f358bfb62e0942b1516fa Mon Sep 17 00:00:00 2001 From: b-sai Date: Wed, 15 Feb 2023 17:06:59 -0600 Subject: [PATCH 4/5] adding converter --- S23/b-sai/main.py | 10 +++++----- S23/b-sai/readme.md | 5 +++++ S23/b-sai/tests.tsv | 4 +++- 3 files changed, 13 insertions(+), 6 deletions(-) create mode 100644 S23/b-sai/readme.md diff --git a/S23/b-sai/main.py b/S23/b-sai/main.py index 8e79b250..abbaf1ea 100644 --- a/S23/b-sai/main.py +++ b/S23/b-sai/main.py @@ -31,13 +31,13 @@ def to_lowercase(word: str, language: str): num_correct = 0 for test in tests: - word, language, expected = test.split("\t") + word, language, actual = test.split("\t") predicted = to_lowercase(word, language) - if predicted != expected: - print(f"Failed to convert {word} to lowercase in {language}!") - print(f"Actual: {expected}") + if predicted != actual: + print(f"COuldn't convert {word} in {language}!") + print(f"Actual: {actual}") print(f"Predicted: {predicted}") else: num_correct += 1 -print(f"Passed {num_correct} out of {len(tests)} tests.") +print(f"Successfully completed {num_correct}/{len(tests)} tests") diff --git a/S23/b-sai/readme.md b/S23/b-sai/readme.md new file mode 100644 index 00000000..3fa6b69e --- /dev/null +++ b/S23/b-sai/readme.md @@ -0,0 +1,5 @@ +### b-sai lowercase converter + +This is a simple tool to convert uppercase letters to lowercase letters in a text file in any language + +To run the python script simply run ```python3 main.py``` from the S23/b-sai/ directory. \ No newline at end of file diff --git a/S23/b-sai/tests.tsv b/S23/b-sai/tests.tsv index 9c6d2da9..3be4992f 100644 --- a/S23/b-sai/tests.tsv +++ b/S23/b-sai/tests.tsv @@ -20,4 +20,6 @@ KASIM en kasim うさぎ ja うさぎ ลา th ลา ลิง th ลิง -ΚΑΘΙΣΤΕ el καθιστε \ No newline at end of file +ΚΑΘΙΣΤΕ el καθιστε +comPuTer_**#science en computer_**#science +tACKY en tacky \ No newline at end of file From ea00c6931b4a326cf82698e449960d2d3f150953 Mon Sep 17 00:00:00 2001 From: b-sai Date: Wed, 22 Feb 2023 15:57:06 -0600 Subject: [PATCH 5/5] fix issues --- S23/b-sai/main.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/S23/b-sai/main.py b/S23/b-sai/main.py index abbaf1ea..c4fb3c61 100644 --- a/S23/b-sai/main.py +++ b/S23/b-sai/main.py @@ -5,6 +5,9 @@ def to_lowercase(word: str, language: str): language: str, the language of the string, in BCP 47 format """ result = "" + + if language.startswith(("zh", "th", "ja")): + return word.lower() for idx, letter in enumerate(word): @@ -13,13 +16,17 @@ def to_lowercase(word: str, language: str): if letter == 'I': lower_letter = 'ı' elif language.startswith(('gd', 'gv', 'ga')): - if idx == 1 and (letter in ['A', 'E', 'I', 'O', 'U', 'Á', 'É', 'Í', 'Ó', 'Ú', "Ó"] or ord(letter) in [211]) and word[0] in ['n', 't'] and (len(word)-idx >= 2 and ord(word[idx+1]) != 771): + is_2nd_letter = idx == 1 + is_exception_letter = letter in [ + 'A', 'E', 'I', 'O', 'U', 'Á', 'É', 'Í', 'Ó', 'Ú', "Ó"] + is_letter_o_latin = ord(letter) in [211] + is_beginning_exception = word[0] in ['n', 't'] + is_not_last = len(word)-idx > 1 + if is_2nd_letter and (is_exception_letter or is_letter_o_latin) and is_beginning_exception and (is_not_last and ord(word[idx+1]) != 771): lower_letter = "-"+letter.lower() elif language.startswith('el'): if letter == 'Σ' and idx == len(word)-1: lower_letter = 'ς' - elif language.startswith(("zh", "th", "ja")): - lower_letter = letter result += lower_letter