diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..21c0633 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +slovene_g2p.egg-info/* +slovene_g2p/slovene_g2p.egg-info/* +build +dist diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..e6fcb18 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +recursive-include slovene_g2p/resources/ * \ No newline at end of file diff --git a/README.md b/README.md index c001bd9..b298c46 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,16 @@ # slovene_g2p A converter that converts Slovene words to their IPA and/or SAMPA transcriptions. + + + +## Usage + +``` +from slovene_g2p.SloveneG2P import SloveneG2PBase +g2p = SloveneG2PBase("ipa_symbol", "cjvt_ipa_detailed_representation", "phoneme_string") +g2p.convert_to_phonetic_transcription(word="govoriti", msd_sl="Ggdd-em", morphological_pattern_code="G1.2.d") +``` + +The first argument (`representation_option`) can be either "ipa_symbol" or "sampa_symbol"; the second (`phoneme_set_option`) can be one of "cjvt_ipa_detailed_representation", "cjvt_ipa_robust_representation", "cjvt_sampa_detailed_representation", or "cjvt_sampa_robust_representation". + +Both msd_sl and morphological_pattern_code are available in Sloleks 3.0 and are provided by the classla Python package. \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8cf3256 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f349118 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +nltk>=3.6.7 +classla>=1.1.0 +reldi-tokeniser>=1.0.1 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..cd73a0e --- /dev/null +++ b/setup.py @@ -0,0 +1,22 @@ +from setuptools import setup, find_packages +import os + +cwd = os.path.dirname(os.path.abspath(__file__)) + +requirements = 
open(os.path.join(cwd, "requirements.txt"), "r").readlines() + +with open("README.md", "r", encoding="utf-8") as readme_file: + README = readme_file.read() + + +setup( + name='slovene_g2p', + version='0.0.9', + author = "Peter Pisljar", + description = "rule based slovenian g2p", + long_description=README, + install_requires=requirements, + packages=find_packages(), + python_requires=">=3.8.0, <3.12", + include_package_data=True, +) \ No newline at end of file diff --git a/slovene_g2p.egg-info/PKG-INFO b/slovene_g2p.egg-info/PKG-INFO new file mode 100644 index 0000000..d603c73 --- /dev/null +++ b/slovene_g2p.egg-info/PKG-INFO @@ -0,0 +1,27 @@ +Metadata-Version: 2.1 +Name: slovene_g2p +Version: 0.0.9 +Summary: rule based slovenian g2p +Author: Peter Pisljar +Requires-Python: >=3.8.0, <3.12 +License-File: LICENSE +Requires-Dist: nltk>=3.6.7 +Requires-Dist: classla>=1.1.0 +Requires-Dist: reldi-tokeniser>=1.0.1 + +# slovene_g2p +A converter that converts Slovene words to their IPA and/or SAMPA transcriptions. 
+ + + +## usage + +``` +from slovene_g2p.SloveneG2P import SloveneG2P +g2p = SloveneG2P("ipa_symbol", "cjvt_ipa_detailed_representation", "phoneme_string") +g2p.convert_to_phonetic_transcription(word="govoriti", msd_sl="Ggdd-em", morphological_pattern_code="G1.2.d") +``` + +phoneme_option can be either "ipa_symbol" or "sampa_symbol" and representation option can be either "cjvt_ipa_detailed_representation", "cjvt_ipa_robust_representation", "cjvt_sampa_detailed_representation", "cjvt_sampa_robust_representation" + +both msd_sl and morphological_pattern_code are available in sloleks 3.0 and provided by classla python package diff --git a/slovene_g2p.egg-info/SOURCES.txt b/slovene_g2p.egg-info/SOURCES.txt new file mode 100644 index 0000000..65178c7 --- /dev/null +++ b/slovene_g2p.egg-info/SOURCES.txt @@ -0,0 +1,18 @@ +LICENSE +MANIFEST.in +README.md +pyproject.toml +setup.py +slovene_g2p/SloveneG2P.py +slovene_g2p/__init__.py +slovene_g2p.egg-info/PKG-INFO +slovene_g2p.egg-info/SOURCES.txt +slovene_g2p.egg-info/dependency_links.txt +slovene_g2p.egg-info/requires.txt +slovene_g2p.egg-info/top_level.txt +slovene_g2p/resources/SloveneG2P_phoneme_set.json +slovene_g2p/resources/schwa_rules.tsv +slovene_g2p/resources/table_of_consonant_phonemes.tsv +slovene_g2p/resources/table_of_obstruent_conversions.tsv +slovene_g2p/resources/table_of_other_symbols.tsv +slovene_g2p/resources/table_of_vowel_phonemes.tsv \ No newline at end of file diff --git a/slovene_g2p.egg-info/dependency_links.txt b/slovene_g2p.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/slovene_g2p.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/slovene_g2p.egg-info/top_level.txt b/slovene_g2p.egg-info/top_level.txt new file mode 100644 index 0000000..c560697 --- /dev/null +++ b/slovene_g2p.egg-info/top_level.txt @@ -0,0 +1 @@ +slovene_g2p diff --git a/SloveneG2P.py b/slovene_g2p/SloveneG2P.py similarity index 96% rename from SloveneG2P.py rename to 
slovene_g2p/SloveneG2P.py index 012a645..74e2737 100644 --- a/SloveneG2P.py +++ b/slovene_g2p/SloveneG2P.py @@ -1,12 +1,27 @@ import json +import os from collections import defaultdict as dd +current_folder = os.path.dirname(__file__) class SloveneG2P: + def __init__(self): + self.ipa_converter = SloveneG2PBase("ipa_symbol", "cjvt_ipa_detailed_representation", "phoneme_string") + self.sampa_converter = SloveneG2PBase("sampa_symbol", "cjvt_sampa_detailed_representation", "phoneme_string") + + def ipa(self, word, msd, mpc): + return self.ipa_converter.convert_to_phonetic_transcription(word, msd, mpc) + + def sampa(self, word, msd, mpc): + return self.sampa_converter.convert_to_phonetic_transcription(word, msd, mpc) + + +class SloveneG2PBase: + def __init__(self, representation_option, phoneme_set_option, output_option): - self.phoneme_set_file_path = "./resources/SloveneG2P_phoneme_set.json" - self.conversion_file_path = "./resources/table_of_obstruent_conversions.tsv" + self.phoneme_set_file_path = os.path.join(current_folder, "resources/SloveneG2P_phoneme_set.json") + self.conversion_file_path = os.path.join(current_folder, "resources/table_of_obstruent_conversions.tsv") self.representation_option = representation_option self.phoneme_set_option = phoneme_set_option @@ -32,7 +47,7 @@ def __init__(self, representation_option, phoneme_set_option, output_option): # GET LIST OF SCHWA RULES self.set_schwa_combinations = set() - file_with_schwa_rules = open("./resources/schwa_rules.tsv", "r", encoding="UTF-8").readlines() + file_with_schwa_rules = open(os.path.join(current_folder, "resources/schwa_rules.tsv"), "r", encoding="UTF-8").readlines() for line in file_with_schwa_rules: all_info = line.strip("\n").split("\t") morph_code = all_info[0] @@ -40,7 +55,6 @@ def __init__(self, representation_option, phoneme_set_option, output_option): relevant_msds = all_info[2] for relevant_msd in relevant_msds.split(", "): schwa_combination = f"{morph_code} ~ {relevant_msd}" - 
print(schwa_combination) self.set_schwa_combinations.add(schwa_combination) # RESOURCE FUNCTION - LIST OF VOWEL GRAPHEMES diff --git a/slovene_g2p/__init__.py b/slovene_g2p/__init__.py new file mode 100644 index 0000000..c620b44 --- /dev/null +++ b/slovene_g2p/__init__.py @@ -0,0 +1 @@ +from .SloveneG2P import SloveneG2P \ No newline at end of file diff --git a/slovene_g2p/__pycache__/SloveneG2P.cpython-310.pyc b/slovene_g2p/__pycache__/SloveneG2P.cpython-310.pyc new file mode 100644 index 0000000..d8ab703 Binary files /dev/null and b/slovene_g2p/__pycache__/SloveneG2P.cpython-310.pyc differ diff --git a/slovene_g2p/__pycache__/__init__.cpython-310.pyc b/slovene_g2p/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..31dca3a Binary files /dev/null and b/slovene_g2p/__pycache__/__init__.cpython-310.pyc differ diff --git a/resources/SloveneG2P_phoneme_set.json b/slovene_g2p/resources/SloveneG2P_phoneme_set.json similarity index 100% rename from resources/SloveneG2P_phoneme_set.json rename to slovene_g2p/resources/SloveneG2P_phoneme_set.json diff --git a/resources/schwa_rules.tsv b/slovene_g2p/resources/schwa_rules.tsv similarity index 100% rename from resources/schwa_rules.tsv rename to slovene_g2p/resources/schwa_rules.tsv diff --git a/resources/table_of_consonant_phonemes.tsv b/slovene_g2p/resources/table_of_consonant_phonemes.tsv similarity index 100% rename from resources/table_of_consonant_phonemes.tsv rename to slovene_g2p/resources/table_of_consonant_phonemes.tsv diff --git a/resources/table_of_obstruent_conversions.tsv b/slovene_g2p/resources/table_of_obstruent_conversions.tsv similarity index 74% rename from resources/table_of_obstruent_conversions.tsv rename to slovene_g2p/resources/table_of_obstruent_conversions.tsv index 9eb822a..feab4cb 100644 --- a/resources/table_of_obstruent_conversions.tsv +++ b/slovene_g2p/resources/table_of_obstruent_conversions.tsv @@ -12,4 +12,8 @@ voiceless_to_voiced_obstruent C_6.1 C_6.2 š ž 
voiced_to_voiceless_obstruent C_9.2 C_9.1 dž č voiceless_to_voiced_obstruent C_9.1 C_9.2 č dž voiced_to_voiceless_obstruent C_8.2 C_8.1 dz c -voiceless_to_voiced_obstruent C_8.1 C_8.2 c dz \ No newline at end of file +voiceless_to_voiced_obstruent C_8.1 C_8.2 c dz +voiceless_to_voiced_obstruent C_4 C_12.1 f v +voiced_to_voiceless_obstruent C_7.2 C_7.1 ɣ h +voiceless_to_voiced_obstruent C_7.1 C_7.2 h ɣ +voiced_to_voiceless_obstruent C_1.2.3 C_1.1.2 b_f p_f \ No newline at end of file diff --git a/resources/table_of_other_symbols.tsv b/slovene_g2p/resources/table_of_other_symbols.tsv similarity index 100% rename from resources/table_of_other_symbols.tsv rename to slovene_g2p/resources/table_of_other_symbols.tsv diff --git a/resources/table_of_vowel_phonemes.tsv b/slovene_g2p/resources/table_of_vowel_phonemes.tsv similarity index 100% rename from resources/table_of_vowel_phonemes.tsv rename to slovene_g2p/resources/table_of_vowel_phonemes.tsv