From 3393e446bbb610c553191c79ec9e621abce2a975 Mon Sep 17 00:00:00 2001 From: Animenosekai <40539549+Animenosekai@users.noreply.github.com> Date: Tue, 2 Mar 2021 19:31:46 +0100 Subject: [PATCH] [add] Adding text_to_speech in Alpha [fix] Fixing #6 [add] Adding the "cache_clean" method to Translator() --- README.md | 12 +- setup.py | 2 +- translatepy/__init__.py | 2 +- translatepy/translate.py | 21 +++ translatepy/translators/bing.py | 12 +- translatepy/translators/google.py | 59 +++++++- translatepy/translators/yandex.py | 7 +- translatepy/utils/gtoken.py | 244 ++++++++++++++++++++++++++++++ 8 files changed, 343 insertions(+), 16 deletions(-) create mode 100644 translatepy/utils/gtoken.py diff --git a/README.md b/README.md index 53ecab4..695b283 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Minimum required versions: 3.2 Incompatible versions: 2 ``` -According to Vermin, Python 3.2 is needed for the backport of typing but some may say that it is available for python versions higher than 3.0 +According to Vermin (`--backport typing`), Python 3.2 is needed for the backport of typing but some may say that it is available for python versions higher than 3.0 Always check if your Python version works with `translatepy` before using it in production @@ -47,7 +47,7 @@ You can check if you successfully installed it by printing out its version: ```bash python -c "import translatepy; print(translatepy.__version__)" # output: -translatepy v1.4 +translatepy v1.5 ``` ## List of Services @@ -127,6 +127,10 @@ translatepy.EXAMPLE_CACHES = {} translatepy.DICTIONNARY_CACHES = {} ``` +Or by calling the `Translator()` method "`clean_cache`" + +***Warning: `translatepy`'s caches are global: they are used through all instances of `Translator()`*** + ### The Translator Class It is the High API providing all of the methods and optimizations for `translatepy` - translate: To translate things @@ -141,6 +145,10 @@ When something goes wrong or nothing got found, `None` is returned. 
The source language while being most of the time an instance of the Language class can sometimes be a string if the conversion to the Language class failed. +An additional `"text_to_speech"` function can be found in the GoogleTranslate class (accessible with the `Translator()` class at `Translator().google_translate`). +***It is not officially supported and is not very stable.*** + + ## Deployment This module is currently in development and might contain bugs. diff --git a/setup.py b/setup.py index 2a644c2..5adcdc8 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name = "translatepy", packages = ["translatepy"], - version = "1.4", + version = "1.5", license = "GNU General Public License v3 (GPLv3)", description = "Translate, transliterate, get the language of texts in no time with the help of multiple APIs!", author = "Anime no Sekai", diff --git a/translatepy/__init__.py b/translatepy/__init__.py index 94b80c6..23401c3 100644 --- a/translatepy/__init__.py +++ b/translatepy/__init__.py @@ -11,7 +11,7 @@ __copyright__ = 'Copyright 2021, translate' __credits__ = ['animenosekai'] __license__ = 'GNU General Public License v3 (GPLv3)' -__version__ = 'translatepy v1.4' +__version__ = 'translatepy v1.5' __maintainer__ = 'Anime no Sekai' __email__ = 'niichannomail@gmail.com' __status__ = 'Stable' diff --git a/translatepy/translate.py b/translatepy/translate.py index ca2a4c9..b77601f 100644 --- a/translatepy/translate.py +++ b/translatepy/translate.py @@ -309,4 +309,25 @@ def dictionary(self, text, destination_language, source_language=None) -> Union[ DICTIONARY_CACHES[str({"t": str(text), "d": str(destination_language), "s": str(lang)})] = response return response + def clean_cache(self) -> None: + """ + Cleans translatepy's global caches + + Returns: + None + """ + global TRANSLATION_CACHES + global TRANSLITERATION_CACHES + global SPELLCHECK_CACHES + global LANGUAGE_CACHES + global EXAMPLE_CACHES + global DICTIONARY_CACHES + + TRANSLATION_CACHES = {} + 
TRANSLITERATION_CACHES = {} + SPELLCHECK_CACHES = {} + LANGUAGE_CACHES = {} + EXAMPLE_CACHES = {} + DICTIONARY_CACHES = {} + #translator = Translator() diff --git a/translatepy/translators/bing.py b/translatepy/translators/bing.py index 8a419ea..433ec1d 100644 --- a/translatepy/translators/bing.py +++ b/translatepy/translators/bing.py @@ -86,7 +86,7 @@ def translate(self, text, destination_language, source_language="auto-detect") - return None, None - def example(self, text, destination_language, source_language="auto-detect") -> Union[Tuple[str, List[Example]], Tuple[None, None]]: + def example(self, text, destination_language, source_language=None, translation=None) -> Union[Tuple[str, List[Example]], Tuple[None, None]]: """ Gives examples for the given text @@ -101,9 +101,15 @@ def example(self, text, destination_language, source_language="auto-detect") -> """ try: - source_language, translation = self.translate(text, destination_language, source_language) if translation is None: - return None, None + source_language, translation = self.translate(text, destination_language, source_language) + if translation is None or source_language is None: + return None, None + else: + if source_language is None: + source_language = self.language(text) + if source_language is None: + return None, None request = post("https://www.bing.com/texamplev3", headers=HEADERS, params=PARAMS, data={'text': str(text).lower(), 'from': str(source_language), 'to': str(destination_language), 'translation': str(translation).lower()}) if request.status_code < 400: return source_language, [Example(example) for example in loads(request.text)[0]["examples"]] diff --git a/translatepy/translators/google.py b/translatepy/translators/google.py index 13b8f7f..a85f65a 100644 --- a/translatepy/translators/google.py +++ b/translatepy/translators/google.py @@ -1,8 +1,12 @@ from typing import Union from requests import get from json import loads +from urllib.parse import quote +from traceback import 
print_exc +from translatepy.utils.gtoken import TokenAcquirer from translatepy.utils.annotations import Tuple +from translatepy.utils.utils import convert_to_float HEADERS = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36' @@ -11,7 +15,10 @@ class GoogleTranslate(): """A Python implementation of Google Translate's APIs""" def __init__(self) -> None: - pass + try: + self.token_acquirer = TokenAcquirer() + except: + self.token_acquirer = None def translate(self, text, destination_language, source_language="auto") -> Union[Tuple[str, str], Tuple[None, None]]: """ @@ -28,14 +35,15 @@ def translate(self, text, destination_language, source_language="auto") -> Union """ try: + text = quote(str(text), safe='') if source_language is None: source_language = "auto" - request = get("https://translate.googleapis.com/translate_a/single?client=gtx&dt=t&sl=" + str(source_language) + "&tl=" + str(destination_language) + "&q=" + str(text)) + request = get("https://translate.googleapis.com/translate_a/single?client=gtx&dt=t&sl=" + str(source_language) + "&tl=" + str(destination_language) + "&q=" + text) if request.status_code < 400: data = loads(request.text) return data[2], "".join([sentence[0] for sentence in data[0]]) else: - request = get("https://clients5.google.com/translate_a/t?client=dict-chrome-ex&sl=" + str(source_language) + "&tl=" + str(destination_language) + "&q=" + str(text), headers=HEADERS) + request = get("https://clients5.google.com/translate_a/t?client=dict-chrome-ex&sl=" + str(source_language) + "&tl=" + str(destination_language) + "&q=" + text, headers=HEADERS) if request.status_code < 400: data = loads(request.text) return data['ld_result']["srclangs"][0], "".join(sentence["trans"] for sentence in data["sentences"]) @@ -44,14 +52,50 @@ def translate(self, text, destination_language, source_language="auto") -> Union except: return None, None - def transliterate(): + def 
transliterate(self): """Transliterates the given text""" raise NotImplementedError - def define(): + def define(self): """Returns the definition of the given word""" raise NotImplementedError + def text_to_speech(self, text, source_language=None, speed=1): + """ + Gives back the text to speech result for the given text + + Args: + text: + + Returns: + bytes --> the mp3 file as bytes + None --> when an error occurs + + !! Currently doesn't seem to work well because of the Token Generation methods. + > Please refer to #234@ssut/py-googletrans if you have any problem + """ + try: + if self.token_acquirer is None: + return None + text = str(text) + textlen = len(text) + token = self.token_acquirer.do(text) + if token is None: + return None + if source_language is None: + source_language = self.language(text) + if source_language is None: + return None + text = quote(str(text), safe='') + request = get("https://translate.google.com/translate_tts?ie=UTF-8&q=" + text + "&tl=" + source_language + "&total=1&idx=0&textlen=" + textlen + "&tk=" + str(token) + "&client=webapp&prev=input&ttsspeed=" + str(convert_to_float(speed))) + if request.status_code < 400: + return request.content + else: + return None + except: + print_exc() + return None + def language(self, text) -> Union[str, None]: """ Gives back the language of the given text @@ -65,11 +109,12 @@ def language(self, text) -> Union[str, None]: """ try: - request = get("https://translate.googleapis.com/translate_a/single?client=gtx&dt=t&sl=auto&tl=ja&q=" + str(text)) + text = quote(str(text), safe='') + request = get("https://translate.googleapis.com/translate_a/single?client=gtx&dt=t&sl=auto&tl=ja&q=" + text) if request.status_code < 400: return loads(request.text)[2] else: - request = get("https://clients5.google.com/translate_a/t?client=dict-chrome-ex&sl=auto&tl=ja&q=" + str(text), headers=HEADERS) + request = get("https://clients5.google.com/translate_a/t?client=dict-chrome-ex&sl=auto&tl=ja&q=" + text, 
headers=HEADERS) if request.status_code < 400: return loads(request.text)['ld_result']["srclangs"][0] else: diff --git a/translatepy/translators/yandex.py b/translatepy/translators/yandex.py index cad8c6f..254d745 100644 --- a/translatepy/translators/yandex.py +++ b/translatepy/translators/yandex.py @@ -3,6 +3,7 @@ from random import randint from os.path import dirname, abspath from typing import Union +from urllib.parse import quote from safeIO import TextFile from requests import get, post @@ -73,7 +74,7 @@ def refreshSID(self) -> bool: self._last_tried = time() # maybe keep that in a file self._last_tried_cache.write(self._last_tried) # else - # do nothing as we know that yandex will rate-limit us if we ping too much their website + # do nothing as we know that yandex will rate-limit us if we ping them too much return False except: return False @@ -238,7 +239,9 @@ def language(self, text, hint=None) -> Union[str, None]: if self._sid.replace(" ", "") == "" and not self.refreshSID(): return None - url = self._base_url + "detect?sid=" + self._sid + "&srv=tr-text&text=" + str(text) + "&options=1&hint=" + str(hint) + text = quote(str(text), safe='') + + url = self._base_url + "detect?sid=" + self._sid + "&srv=tr-text&text=" + text + "&options=1&hint=" + str(hint) def _request(): """ """ diff --git a/translatepy/utils/gtoken.py b/translatepy/utils/gtoken.py new file mode 100644 index 0000000..a32e395 --- /dev/null +++ b/translatepy/utils/gtoken.py @@ -0,0 +1,244 @@ +""" +ORIGINAL FILENAME: + gtoken.py +SOURCE PROJECT: + ssut/py-googletrans (https://github.com/ssut/py-googletrans)\n +AUTHOR: + SuHun Han (@ssut on GitHub) +EXPLANATION: + Generates a ticket to access Google Translate's API + Reverse engineered by ssut on the obfuscated and minified code used by Google to generate such token, and implemented on the top of Python. + However, this could be blocked at any time. 
+COPYRIGHT: + Copyright (c) 2015 SuHun Han +LICENSE: + The MIT License (MIT) + + Copyright (c) 2015 SuHun Han + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +""" + +import ast +from time import time +from math import floor +from re import compile, DOTALL + +from requests import get + +HEADERS = { + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Pragma": "no-cache", + "Cache-Control": "no-cache", + "Host": "translate.google.com", + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15", + "Accept-Language": "fr-fr", + "Accept-Encoding": "gzip, deflate, br", + "Connection": "keep-alive" +} + +class TokenAcquirer: + """Google Translate API token generator + + translate.google.com uses a token to authorize the requests. If you are + not Google, you do have this token and will have to pay for use. 
+ This class is the result of reverse engineering on the obfuscated and + minified code used by Google to generate such token. + + The token is based on a seed which is updated once per hour and on the + text that will be translated. + Both are combined - by some strange math - in order to generate a final + token (e.g. 744915.856682) which is used by the API to validate the + request. + + This operation will cause an additional request to get an initial + token from translate.google.com. + + Example usage: + >>> from googletrans.gtoken import TokenAcquirer + >>> acquirer = TokenAcquirer() + >>> text = 'test' + >>> tk = acquirer.do(text) + >>> tk + 950629.577246 + """ + + RE_TKK = compile(r'tkk:\'(.+?)\'', DOTALL) + RE_RAWTKK = compile(r'tkk:\'(.+?)\'', DOTALL) + + def __init__(self, host='translate.google.com'): + self.tkk = '0' + self.host = host if 'http' in host else 'https://' + host + + def _update(self): + """update tkk + """ + # we don't need to update the base TKK value when it is still valid + now = floor(int(time() * 1000) / 3600000.0) + if self.tkk and int(self.tkk.split('.')[0]) == now: + return + + r = get(self.host, headers=HEADERS) + + raw_tkk = self.RE_TKK.search(r.text) + if raw_tkk: + self.tkk = raw_tkk.group(1) + return + + try: + # this will be the same as python code after stripping out a reserved word 'var' + code = self.RE_TKK.search(r.text).group(1).replace('var ', '') + # unescape special ascii characters such like a \x3d(=) + code = code.encode().decode('unicode-escape') + except AttributeError: + raise Exception('Could not find TKK token for this request.\nSee https://github.com/ssut/py-googletrans/issues/234 for more details.') + except: + raise + + if code: + tree = ast.parse(code) + visit_return = False + operator = '+' + n, keys = 0, dict(a=0, b=0) + for node in ast.walk(tree): + if isinstance(node, ast.Assign): + name = node.targets[0].id + if name in keys: + if isinstance(node.value, ast.Num): + keys[name] = node.value.n + # the 
value can sometimes be negative + elif isinstance(node.value, ast.UnaryOp) and \ + isinstance(node.value.op, ast.USub): # pragma: nocover + keys[name] = -node.value.operand.n + elif isinstance(node, ast.Return): + # parameters should be set after this point + visit_return = True + elif visit_return and isinstance(node, ast.Num): + n = node.n + elif visit_return and n > 0: + # the default operator is '+' but implement some more for + # all possible scenarios + if isinstance(node, ast.Add): # pragma: nocover + pass + elif isinstance(node, ast.Sub): # pragma: nocover + operator = '-' + elif isinstance(node, ast.Mult): # pragma: nocover + operator = '*' + elif isinstance(node, ast.Pow): # pragma: nocover + operator = '**' + elif isinstance(node, ast.BitXor): # pragma: nocover + operator = '^' + # a safety way to avoid Exceptions + clause = compile('{1}{0}{2}'.format( + operator, keys['a'], keys['b']), '', 'eval') + value = eval(clause, dict(__builtin__={})) + result = '{}.{}'.format(n, value) + + self.tkk = result + + def _lazy(self, value): + """like lazy evaluation, this method returns a lambda function that + returns value given. + We won't be needing this because this seems to have been built for + code obfuscation. + + the original code of this method is as follows: + + ... 
code-block: javascript + + var ek = function(a) { + return function() { + return a; + }; + } + """ + return lambda: value + + def _xr(self, a, b): + size_b = len(b) + c = 0 + while c < size_b - 2: + d = b[c + 2] + d = ord(d[0]) - 87 if 'a' <= d else int(d) + d = (a % 0x100000000) >> d if '+' == b[c + 1] else a << d + a = a + d & 4294967295 if '+' == b[c] else a ^ d + + c += 3 + return a + + def acquire(self, text): + a = [] + # Convert text to ints + for i in text: + val = ord(i) + if val < 0x10000: + a += [val] + else: + # Python doesn't natively use Unicode surrogates, so account for those + a += [ + floor((val - 0x10000) / 0x400 + 0xD800), + floor((val - 0x10000) % 0x400 + 0xDC00) + ] + + b = self.tkk if self.tkk != '0' else '' + d = b.split('.') + b = int(d[0]) if len(d) > 1 else 0 + + # assume e means char code array + e = [] + g = 0 + size = len(a) + while g < size: + l = a[g] + # just append if l is less than 128(ascii: DEL) + if l < 128: + e.append(l) + # append calculated value if l is less than 2048 + else: + if l < 2048: + e.append(l >> 6 | 192) + else: + # append calculated value if l matches special condition + if (l & 64512) == 55296 and g + 1 < size and \ + a[g + 1] & 64512 == 56320: + g += 1 + l = 65536 + ((l & 1023) << 10) + (a[g] & 1023) # This bracket is important + e.append(l >> 18 | 240) + e.append(l >> 12 & 63 | 128) + else: + e.append(l >> 12 | 224) + e.append(l >> 6 & 63 | 128) + e.append(l & 63 | 128) + g += 1 + a = b + for i, value in enumerate(e): + a += value + a = self._xr(a, '+-a^+6') + a = self._xr(a, '+-3^+b+-f') + a ^= int(d[1]) if len(d) > 1 else 0 + if a < 0: # pragma: nocover + a = (a & 2147483647) + 2147483648 + a %= 1000000 # int(1E6) + + return '{}.{}'.format(a, a ^ b) + + def do(self, text): + self._update() + tk = self.acquire(text) + return tk \ No newline at end of file