From 234b6b56e74698d571c045b39f25f198bfd6b157 Mon Sep 17 00:00:00 2001
From: Tiago Ribeiro
Date: Wed, 23 Oct 2024 12:45:14 +0200
Subject: [PATCH 01/71] Code changes to support removal of the column is_learned
---
 tools/old/fix_bookmark_learned_status.py | 9 ++++++++-
 tools/old/past_exercises_for_user.py | 6 +++---
 .../core/bookmark_quality/top_bookmarks_for_user.py | 2 +-
 zeeguu/core/model/bookmark.py | 11 +++++++----
 zeeguu/core/model/user.py | 4 ++--
 zeeguu/core/sql/queries/words_to_study.sql | 3 +--
 zeeguu/core/word_scheduling/basicSR/basicSR.py | 3 +--
 7 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/tools/old/fix_bookmark_learned_status.py b/tools/old/fix_bookmark_learned_status.py
index 2209051a..b3eabff0 100644
--- a/tools/old/fix_bookmark_learned_status.py
+++ b/tools/old/fix_bookmark_learned_status.py
@@ -12,6 +12,11 @@ def print_bookmarks_that_are_learned_without_history(bookmarks):
 def print_bookmarks_that_are_wrongly_learned(bookmarks):
     i = 0

+    # This script needs to be updated.
+    # bookmark.learned has been removed and the learned state is now stored
+    # only in bookmark.learned_time; the correct behaviour would now be to
+    # set the date on which the bookmark was learned.
+
     for bookmark in bookmarks:

         if not bookmark.learned_time:
@@ -20,7 +25,9 @@ def print_bookmarks_that_are_wrongly_learned(bookmarks):
         algo_result = is_learned_based_on_exercise_outcomes(bookmark)

         if bookmark.learned != algo_result:
-            print(f"){i}) mismatch: {bookmark} DB={bookmark.learned} ALGO={algo_result}")
+            print(
+                f"{i}) mismatch: {bookmark} DB={bookmark.learned} ALGO={algo_result}"
+            )
             print(bookmark.compact_sorted_exercise_log())
             print(" ")

diff --git a/tools/old/past_exercises_for_user.py b/tools/old/past_exercises_for_user.py
index 49cdaf34..93b1175f 100644
--- a/tools/old/past_exercises_for_user.py
+++ b/tools/old/past_exercises_for_user.py
@@ -56,7 +56,7 @@ def past_exercises_for(user_id):
             f"{ex.time.day}/{ex.time.month} {bookmark.origin.word}({bookmark.id}) {ex.outcome.outcome} < ({past})"
         )

-        if bookmark.learned and ex.time == bookmark.learned_time:
+        if bookmark.is_learned() and ex.time == bookmark.learned_time:
             print("Learned!")

     print(" ")
@@ -66,7 +66,7 @@ def past_exercises_for(user_id):
         print(
             f"{btime} "
             + ("[fit_for_study] " if bookmark.fit_for_study else "")
-            + ("[Learned] " if bookmark.learned else "")
+            + ("[Learned] " if bookmark.is_learned() else "")
             + f"{bookmark.id} "
             + f"{bookmark.origin.word} / {bookmark.translation.word}"
         )
@@ -79,7 +79,7 @@ def past_exercises_for(user_id):
             f"{btime} "
             + ("[Quality] " if bookmark.quality_bookmark() else "")
             + ("[fit_for_study] " if bookmark.fit_for_study else "")
-            + ("[Learned] " if bookmark.learned else "")
+            + ("[Learned] " if bookmark.is_learned() else "")
             + f"{bookmark.id} "
             + f"{bookmark.origin.word} / {bookmark.translation.word}"
         )

diff --git a/zeeguu/core/bookmark_quality/top_bookmarks_for_user.py b/zeeguu/core/bookmark_quality/top_bookmarks_for_user.py
index d59faa4f..9be6daf1 100644
--- a/zeeguu/core/bookmark_quality/top_bookmarks_for_user.py
+++ b/zeeguu/core/bookmark_quality/top_bookmarks_for_user.py
@@ -14,7 +14,7 @@ def rank(b):
         query.join(UserWord, Bookmark.origin_id == UserWord.id)
         .filter(UserWord.language_id == self.learned_language_id)
         .filter(Bookmark.user_id == self.id)
-        .filter(Bookmark.learned == False)
+        .filter(Bookmark.learned_time == None)
         .order_by(Bookmark.time.desc())
         .limit(400)
     )

diff --git a/zeeguu/core/model/bookmark.py b/zeeguu/core/model/bookmark.py
index ec6775cd..6a940b75 100644
--- a/zeeguu/core/model/bookmark.py
+++
b/zeeguu/core/model/bookmark.py @@ -88,7 +88,6 @@ def __init__( self.text = text self.starred = False self.fit_for_study = fit_for_study(self) - self.learned = False self.learning_cycle = learning_cycle self.user_preference = UserWordExPreference.NO_PREFERENCE @@ -108,6 +107,9 @@ def serializable_dictionary(self): context=self.text.content, ) + def is_learned(self): + return self.learned_time is not None + def add_new_exercise(self, exercise): self.exercise_log.append(exercise) @@ -115,7 +117,7 @@ def translations_rendered_as_text(self): return self.translation.word def should_be_studied(self): - return (self.starred or self.fit_for_study) and not self.learned + return (self.starred or self.fit_for_study) and not self.is_learned() def content_is_not_too_long(self): return len(self.text.content) < 60 @@ -200,7 +202,9 @@ def json_serializable_dict(self, with_context=True, with_title=False): word_info = Word.stats(self.origin.word, self.origin.language.code) - learned_datetime = str(self.learned_time.date()) if self.learned_time is not None else "" + learned_datetime = ( + str(self.learned_time.date()) if self.learned_time is not None else "" + ) created_day = "today" if self.time.date() == datetime.now().date() else "" @@ -393,7 +397,6 @@ def update_learned_status(self, session): if is_learned: log(f"Log: {exercise_log.summary()}: bookmark {self.id} learned!") self.learned_time = exercise_log.last_exercise_time() - self.learned = True session.add(self) else: log(f"Log: {exercise_log.summary()}: bookmark {self.id} not learned yet.") diff --git a/zeeguu/core/model/user.py b/zeeguu/core/model/user.py index 7db94f32..7c1cb23a 100644 --- a/zeeguu/core/model/user.py +++ b/zeeguu/core/model/user.py @@ -490,9 +490,9 @@ def learned_bookmarks(self, count=50): query.join(UserWord, Bookmark.origin_id == UserWord.id) .filter(UserWord.language_id == self.learned_language_id) .filter(Bookmark.user_id == self.id) - .filter(Bookmark.learned == True) + .filter(Bookmark.learned_time != None) .order_by(Bookmark.learned_time.desc()) - .limit(400) + .limit(count) ) return learned diff --git a/zeeguu/core/sql/queries/words_to_study.sql b/zeeguu/core/sql/queries/words_to_study.sql index ce6f9bb8..c9fefa87 100644 --- a/zeeguu/core/sql/queries/words_to_study.sql +++ b/zeeguu/core/sql/queries/words_to_study.sql @@ -7,7 +7,6 @@ select uw.word, tw.word, uw.rank, - b.learned, b.fit_for_study, b.learned_time, bss.id @@ -18,7 +17,7 @@ from join user_word tw on b.translation_id = tw.id left join basic_sr_schedule bss on b.id = bss.bookmark_id where - b.learned = 0 + b.learned_time is null and b.fit_for_study and bss.id is null -- parameters and u.id = :user_id diff --git a/zeeguu/core/word_scheduling/basicSR/basicSR.py b/zeeguu/core/word_scheduling/basicSR/basicSR.py index 019219bd..31aad015 100644 --- a/zeeguu/core/word_scheduling/basicSR/basicSR.py +++ b/zeeguu/core/word_scheduling/basicSR/basicSR.py @@ -49,7 +49,6 @@ def __init__(self, bookmark=None, bookmark_id=None): self.cooling_interval = 0 def set_bookmark_as_learned(self, db_session): - self.bookmark.learned = True self.bookmark.learned_time = datetime.now() db_session.add(self.bookmark) db_session.delete(self) @@ -211,7 +210,7 @@ def get_unscheduled_bookmarks_for_user(cls, user): unscheduled_bookmarks = ( Bookmark.query.filter(Bookmark.user_id == user.id) .outerjoin(BasicSRSchedule) - .filter(Bookmark.learned == 0) + .filter(Bookmark.learned_time == None) .filter(Bookmark.fit_for_study == 1) .join(UserWord, Bookmark.origin_id == UserWord.id) 
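            # note: the outer join to BasicSRSchedule keeps bookmarks that have
            # no schedule row yet; those are exactly the "unscheduled" ones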
.filter(UserWord.language_id == user.learned_language_id) From 64f50648e92fe6413edafd5edc6659c1a1fc2530 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Wed, 23 Oct 2024 13:42:57 +0200 Subject: [PATCH 02/71] Added sql script --- tools/migrations/24-10-23--delete_bookmark_learned.sql | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 tools/migrations/24-10-23--delete_bookmark_learned.sql diff --git a/tools/migrations/24-10-23--delete_bookmark_learned.sql b/tools/migrations/24-10-23--delete_bookmark_learned.sql new file mode 100644 index 00000000..074405a7 --- /dev/null +++ b/tools/migrations/24-10-23--delete_bookmark_learned.sql @@ -0,0 +1,2 @@ +ALTER TABLE + `zeeguu_test`.`bookmark` DROP COLUMN `learned`; \ No newline at end of file From 17ab5732a800d89dd904c8993cc02e1c6d12aacf Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Wed, 23 Oct 2024 13:43:17 +0200 Subject: [PATCH 03/71] Removed property + SQL in code --- zeeguu/core/model/bookmark.py | 2 -- zeeguu/core/sql/learner/exercises_history.py | 6 +++--- zeeguu/core/sql/learner/words.py | 8 ++++---- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/zeeguu/core/model/bookmark.py b/zeeguu/core/model/bookmark.py index 6a940b75..c3e32b7f 100644 --- a/zeeguu/core/model/bookmark.py +++ b/zeeguu/core/model/bookmark.py @@ -60,8 +60,6 @@ class Bookmark(db.Model): starred = db.Column(db.Boolean, default=False) - learned = db.Column(db.Boolean, default=False) - fit_for_study = db.Column(db.Boolean) learned_time = db.Column(db.DateTime) diff --git a/zeeguu/core/sql/learner/exercises_history.py b/zeeguu/core/sql/learner/exercises_history.py index 54fb1f68..75fd82a9 100644 --- a/zeeguu/core/sql/learner/exercises_history.py +++ b/zeeguu/core/sql/learner/exercises_history.py @@ -12,7 +12,6 @@ def exercises_in_session(session_id: int): o_uw.word, t_uw.word as translation, b.id as bookmark_id, - b.`learned` from exercise as e join exercise_outcome as eo on e.outcome_id = eo.id @@ -44,7 +43,6 @@ def exercise_history(user_id: int, language_id: int, from_date: str, to_date: st o_uw.word, t_uw.word as translation, b.id as bookmark_id, - b.`learned` from exercise as e join exercise_outcome as eo on e.outcome_id = eo.id @@ -72,7 +70,9 @@ def exercise_history(user_id: int, language_id: int, from_date: str, to_date: st ) -def exercises_grouped_by_word(user_id: int, language_id: int, from_date: str, to_date: str): +def exercises_grouped_by_word( + user_id: int, language_id: int, from_date: str, to_date: str +): exercise_details_list = exercise_history(user_id, language_id, from_date, to_date) practiced_dict = {} diff --git a/zeeguu/core/sql/learner/words.py b/zeeguu/core/sql/learner/words.py index 1c20e73b..92de2400 100644 --- a/zeeguu/core/sql/learner/words.py +++ b/zeeguu/core/sql/learner/words.py @@ -67,7 +67,7 @@ def learned_words(user_id, language_id, from_date: str, to_date: str): and b.learned_time < :to_date -- '2021-06-23' and o_uw.language_id = :language_id -- 2 and b.user_id = :user_id -- 2953 - and learned = 1 + and b.learned_time is NOT NULL order by b.learned_time """ @@ -86,8 +86,8 @@ def learned_words(user_id, language_id, from_date: str, to_date: str): each["self_reported"] = ( bookmark.sorted_exercise_log().last_exercise().is_too_easy() ) - each[ - "most_recent_correct_dates" - ] = bookmark.sorted_exercise_log().str_most_recent_correct_dates() + each["most_recent_correct_dates"] = ( + bookmark.sorted_exercise_log().str_most_recent_correct_dates() + ) return results From 3cd2d3f0ee4d041e65f02b13102abaf27ffae70b Mon Sep 17 
00:00:00 2001 From: Tiago Ribeiro Date: Wed, 23 Oct 2024 14:19:48 +0200 Subject: [PATCH 04/71] Fixed broken SQL query --- zeeguu/core/sql/learner/exercises_history.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/zeeguu/core/sql/learner/exercises_history.py b/zeeguu/core/sql/learner/exercises_history.py index 75fd82a9..35771b91 100644 --- a/zeeguu/core/sql/learner/exercises_history.py +++ b/zeeguu/core/sql/learner/exercises_history.py @@ -11,8 +11,7 @@ def exercises_in_session(session_id: int): e.solving_speed, o_uw.word, t_uw.word as translation, - b.id as bookmark_id, - + b.id as bookmark_id from exercise as e join exercise_outcome as eo on e.outcome_id = eo.id join exercise_source as es on e.source_id = es.id @@ -42,8 +41,7 @@ def exercise_history(user_id: int, language_id: int, from_date: str, to_date: st e.solving_speed, o_uw.word, t_uw.word as translation, - b.id as bookmark_id, - + b.id as bookmark_id from exercise as e join exercise_outcome as eo on e.outcome_id = eo.id join exercise_source as es on e.source_id = es.id From 3c4fe8eec7551bd7a2c00e2ae9e363d74987ccf1 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Wed, 23 Oct 2024 14:19:59 +0200 Subject: [PATCH 05/71] Updated scheduling_algo to use learned_time --- .../adaptive/scheduling_algo.py | 123 ++++++++++-------- 1 file changed, 68 insertions(+), 55 deletions(-) diff --git a/zeeguu/core/word_scheduling/adaptive/scheduling_algo.py b/zeeguu/core/word_scheduling/adaptive/scheduling_algo.py index 11b4d843..2df65247 100644 --- a/zeeguu/core/word_scheduling/adaptive/scheduling_algo.py +++ b/zeeguu/core/word_scheduling/adaptive/scheduling_algo.py @@ -1,6 +1,4 @@ - - -# information needed for selecting words to practice +# information needed for selecting words to practice """ - nextDueDate = when the user should practice this again - coolingInterval = how much will they have to wait till they see this word again @@ -17,16 +15,16 @@ to insert new words in the study """ - # scenario 1: - # - a user has translated 100 words; then they have - # two minutes of practice; they practice 10 words - # - next day they open the exercises again; what - # do we want to present them with? old words or - # new ones? - # - well, if they started learning some words we - # should prioritize those; but once they're done - # with the things they should study for the day - # we can go and catch up with older words... +# scenario 1: +# - a user has translated 100 words; then they have +# two minutes of practice; they practice 10 words +# - next day they open the exercises again; what +# do we want to present them with? old words or +# new ones? +# - well, if they started learning some words we +# should prioritize those; but once they're done +# with the things they should study for the day +# we can go and catch up with older words... """ **** coolingInterval *** """ @@ -37,7 +35,7 @@ # tomorrow; he got it wrong; we set it to 1min - would be # good to remind them sooner about it -# - user sees +# - user sees """ Rule #2: currentCoolingInterval should be used in prioritizing @@ -52,11 +50,11 @@ # if we want to add awareness of difficulty levels for # exercises, then get words to study should be called -# with the difficulty in mind +# with the difficulty in mind # e.g. an exercise session that plans to have 3 recognize, 3 audio, and 3 recall -# should make this explicit; such that we don't get very difficult words in -# recall... 
although maybe we can simply let the randomization -# do it's job for now; and ensure +# should make this explicit; such that we don't get very difficult words in +# recall... although maybe we can simply let the randomization +# do it's job for now; and ensure from datetime import datetime, timedelta @@ -71,42 +69,44 @@ def getWordsToStudy(user, numberOfWords): """ - :param: number of words -- how many words to be studied - the system should be fast enough such as to allow for - a query for + :param: number of words -- how many words to be studied + the system should be fast enough such as to allow for + a query for - :param: language - a user might be studying in multiple - languages; let the algo choose which language is the - user interested in + :param: language - a user might be studying in multiple + languages; let the algo choose which language is the + user interested in """ # to think about # - what happens if they have added a lot of words but they # have practiced very little? all their due words are studied - # and now we have to schedule a new word: do we select one + # and now we have to schedule a new word: do we select one # that has high frequency but was first studied two weeks ago? - # or one that has lower frequency but was seen today? - # ... we could ask them? + # or one that has lower frequency but was seen today? + # ... we could ask them? # ... we could prioritize by frequency - + # - we promise to prioritize starred words; how do we exactly - # do that? + # do that? # - i guess in two situations: - # 1. we need to schedule a new word that has not been - # rehearsed yet; we look at the starred ones and + # 1. we need to schedule a new word that has not been + # rehearsed yet; we look at the starred ones and # select one from them; if there's frequency, go with that # else go with (shortness of word/expression as a proxy?) # what if in this way we never get to schedule frequent ones? # well, we promised to schedule starred; so we should do that # 2. 
we have a bunch of words that are due today or before today
    #      we schedule starred ones first, and then non-starred;
    #      if they don't get to do any non-starred; that's ok; they'll
    #      hopefully do them next time

    _now = datetime.now()

    due_words = WordToStudy.query
    due_words = due_words.filter(WordToStudy.user_id == user.id).filter(
        WordToStudy.language_id == user.learned_language_id
    )
    due_words = due_words.filter(WordToStudy.nextDueDate < _now)
    due_words = due_words.limit(numberOfWords).all()

    stillNecessary = numberOfWords - len(due_words)

    if stillNecessary > 0:
        # we need to add other words to study
        # try first to get the starred words first
        new_words = (
            Bookmark.query.filter_by(user_id=user.id)
            .filter(Bookmark.learned_time == None)
            .filter_by(starred=True)
        )
        new_words = new_words.filter(Bookmark.fit_for_study == True)
        new_words = new_words.join(UserWord, Bookmark.origin_id == UserWord.id)
        new_words = new_words.filter(
            ~exists().where(WordToStudy.bookmark_id == Bookmark.id)
        )
        new_words = new_words.filter(UserWord.language_id == user.learned_language_id)
        new_words = new_words.order_by(Bookmark.starred, func.length(UserWord.word))
        new_words = (
            new_words.filter_by(language_id=user.learned_language_id)
            .limit(stillNecessary)
            .all()
        )

        stillNecessary = numberOfWords - len(due_words) - len(new_words)
        if stillNecessary > 0:
            # we can bring some more words
            # we can go with unstarred
            pass

    due_bookmarks = [dw.bookmark for dw in due_words]
    print("DUE: ")
    print(due_bookmarks)


# scenario 2:
# - a user starts with


def updateSchedulingInfo(session, bookmark, outcome):

    # to think about
    # - what do we do if this is the first time a user
    # did a correct exercise, but it was not in
    # the WordsToStudy yet?
    #

    _now = datetime.now()

    try:
        word_to_study = WordToStudy.find(bookmark)

        # if we continue here, it means that we're not seeing this word
        # for the first time
        if outcome == "C":
            word_to_study.coolingInterval = (
                word_to_study.coolingInterval * 2
                if word_to_study.coolingInterval > 0
                else 1
            )
            word_to_study.consecutiveCorrects = word_to_study.consecutiveCorrects + 1
        else:
            # to think about
            # resetting is too harsh (especially if one has a typo, or it's a high-difficulty exercise?)
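            # a gentler alternative (just a sketch, not implemented here) would be
            # to halve the interval instead of zeroing it, e.g.:
            #   word_to_study.coolingInterval = max(0, word_to_study.coolingInterval // 2)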
word_to_study.coolingInterval = 0 - word_to_study.consecutiveCorrects = 0 - + word_to_study.consecutiveCorrects = 0 + word_to_study.nextDueDate = _now + timedelta(days=word_to_study.coolingInterval) - + except NoResultFound: # The first time we've seen this word word_to_study = WordToStudy(bookmark.user, bookmark) - if outcome == 'C': + if outcome == "C": word_to_study.coolingInterval = 1 word_to_study.consecutiveCorrects = 1 - else: + else: word_to_study.coolingInterval = 0 word_to_study.consecutiveCorrects = 0 word_to_study.nextDueDate = _now + timedelta(days=word_to_study.coolingInterval) - + session.add(word_to_study) session.commit() - From ee4d3d274c8d92bb22355546f49aed8a28994976 Mon Sep 17 00:00:00 2001 From: merleschoen Date: Thu, 24 Oct 2024 10:34:20 +0200 Subject: [PATCH 06/71] steps to connect and load db in DBeaver --- README.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 07789229..8a9bd12a 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,7 @@ So for running the development server this is ok, but for actual development, th # From docker-compose on Mac OS ## Starting the API + - create a local folder where you want to store zeeguu data, e.g. `mkdir /Users/mircea/zeeguu-data` - make sure that you have `envsubst` installed (i.e. `brew install gettext`) - copy the content of `default_env` to a newly created `.env` file @@ -89,9 +90,9 @@ So for running the development server this is ok, but for actual development, th - run `docker compose up` - once everything is up, go to `localhost:8080/available_languages`: if you see an array like `["de", "es", "fr", "nl", "en"]` you have the API working. -## Developing +## Developing -Once you make changes to the code you have to restart the apache2ctl inside the container. To test this do the following: +Once you make changes to the code you have to restart the apache2ctl inside the container. 
To test this do the following:

- try to change the implementation of `available_languages` in `system_languages.py` and then run `docker exec -it api-zapi-1 apache2ctl restart`

@@ -118,3 +119,11 @@ I define the following:
 export MYSQLCLIENT_CFLAGS="-I/opt/homebrew/opt/mysql-client/include/mysql/"
 export MYSQLCLIENT_LDFLAGS="-L/opt/homebrew/opt/mysql-client/lib -lmysqlclient"
 ```
+
+## Connecting and loading a database in DBeaver
+
+- expose port 3306 to connect to the local db by adding `ports: ["3306:3306"]` to the mysql service in your docker-compose file
+- create a new database connection in DBeaver and use Server Host `localhost` and Port `3306`
+- import data to your local db by adding a `backups` folder to zeeguu-data and adding volume `- ${ZEEGUU_DATA_FOLDER}/backups:/backups` to the docker-compose file
+- run `docker exec -it <mysql-container-name> sh`
+- run `mysql -uroot -p -h localhost zeeguutest < zeeguu_db_anon_2024-10-16.sql` and enter the root password

From dc5c5aec07696c58b310f3e340f084133fbba2e5 Mon Sep 17 00:00:00 2001
From: Tiago Ribeiro
Date: Thu, 24 Oct 2024 13:12:57 +0200
Subject: [PATCH 07/71] Limit query when generating exercises
---
 zeeguu/api/endpoints/exercises.py | 14 ++++++
 zeeguu/core/model/user.py | 12 ++++--
 .../core/word_scheduling/basicSR/basicSR.py | 43 ++++++++++++++-----
 3 files changed, 55 insertions(+), 14 deletions(-)

diff --git a/zeeguu/api/endpoints/exercises.py b/zeeguu/api/endpoints/exercises.py
index fcf8d4d8..c3c5ff43 100644
--- a/zeeguu/api/endpoints/exercises.py
+++ b/zeeguu/api/endpoints/exercises.py
@@ -35,9 +35,16 @@ def top_bookmarks_to_study():
     Return all the possible bookmarks a user has to study ordered by how
     common it is in the language and how close they are to being learned.
     """
+    import time
+
+    start = time.time()
     user = User.find_by_id(flask.g.user_id)
     to_study = user.bookmarks_to_study(scheduled_only=False)
     json_bookmarks = [bookmark.json_serializable_dict() for bookmark in to_study]
+    end = time.time() - start
+    print(
+        f"### INFO: `top_bookmarks_to_study` took: {end:.4f} seconds, total: {len(json_bookmarks)}"
+    )
     return json_result(json_bookmarks)

@@ -49,9 +56,16 @@ def bookmarks_to_learn_not_scheduled():
     Return all the bookmarks that aren't learned and haven't been
     scheduled to the user.
     """
+    import time
+
+    start = time.time()
     user = User.find_by_id(flask.g.user_id)
     to_study = user.bookmarks_to_learn_not_in_pipeline()
     json_bookmarks = [bookmark.json_serializable_dict() for bookmark in to_study]
+    end = time.time() - start
+    print(
+        f"### INFO: `bookmarks_to_learn_not_scheduled` took: {end:.4f} seconds, total: {len(json_bookmarks)}"
+    )
     return json_result(json_bookmarks)

diff --git a/zeeguu/core/model/user.py b/zeeguu/core/model/user.py
index 7db94f32..d07f66bc 100644
--- a/zeeguu/core/model/user.py
+++ b/zeeguu/core/model/user.py
@@ -222,7 +222,7 @@ def set_learned_language_level(
     def has_bookmarks(self):
         return self.bookmark_count() > 0

-    def bookmarks_to_study(self, bookmark_count=None, scheduled_only=False):
+    def bookmarks_to_study(self, bookmark_count=100, scheduled_only=False):
         """
         We now use logic to sort the words; if we call this every time we want
         similar words, it might bottleneck the application.
@@ -234,9 +234,13 @@ def bookmarks_to_study(self, bookmark_count=None, scheduled_only=False): from zeeguu.core.word_scheduling.basicSR.basicSR import BasicSRSchedule if scheduled_only: - to_study = BasicSRSchedule.priority_scheduled_bookmarks_to_study(self) + to_study = BasicSRSchedule.priority_scheduled_bookmarks_to_study( + self, bookmark_count + ) else: - to_study = BasicSRSchedule.all_bookmarks_priority_to_study(self) + to_study = BasicSRSchedule.all_bookmarks_priority_to_study( + self, bookmark_count + ) return to_study if bookmark_count is None else to_study[:bookmark_count] def get_new_bookmarks_to_study(self, bookmarks_count): @@ -291,7 +295,7 @@ def bookmarks_to_learn_not_in_pipeline(self): from zeeguu.core.word_scheduling.basicSR.basicSR import BasicSRSchedule words_not_started_learning = BasicSRSchedule.get_unscheduled_bookmarks_for_user( - self + self, None ) return words_not_started_learning diff --git a/zeeguu/core/word_scheduling/basicSR/basicSR.py b/zeeguu/core/word_scheduling/basicSR/basicSR.py index 019219bd..347a0508 100644 --- a/zeeguu/core/word_scheduling/basicSR/basicSR.py +++ b/zeeguu/core/word_scheduling/basicSR/basicSR.py @@ -188,7 +188,7 @@ def find_or_create(cls, db_session, bookmark): return schedule @classmethod - def get_scheduled_bookmarks_for_user(cls, user): + def get_scheduled_bookmarks_for_user(cls, user, limit): end_of_day = cls.get_end_of_today() # Get the candidates, words that are to practice scheduled_candidates_query = ( @@ -204,10 +204,14 @@ def get_scheduled_bookmarks_for_user(cls, user): scheduled_candidates_query = scheduled_candidates_query.filter( Bookmark.learning_cycle == LearningCycle.RECEPTIVE ) - return scheduled_candidates_query.all() + scheduled_candidates_query.order_by(-UserWord.rank.desc(), cls.cooling_interval) + if limit is None: + return scheduled_candidates_query.all() + else: + return scheduled_candidates_query.limit(limit).all() @classmethod - def get_unscheduled_bookmarks_for_user(cls, user): + def get_unscheduled_bookmarks_for_user(cls, user, limit): unscheduled_bookmarks = ( Bookmark.query.filter(Bookmark.user_id == user.id) .outerjoin(BasicSRSchedule) @@ -216,9 +220,12 @@ def get_unscheduled_bookmarks_for_user(cls, user): .join(UserWord, Bookmark.origin_id == UserWord.id) .filter(UserWord.language_id == user.learned_language_id) .filter(BasicSRSchedule.cooling_interval == None) - .all() + .order_by(-UserWord.rank.desc()) ) - return unscheduled_bookmarks + if limit is None: + return unscheduled_bookmarks.all() + else: + return unscheduled_bookmarks.limit(limit).all() @classmethod def remove_duplicated_bookmarks(cls, bookmark_list): @@ -239,7 +246,7 @@ def remove_duplicated_bookmarks(cls, bookmark_list): return candidates_no_duplicates @classmethod - def all_bookmarks_priority_to_study(cls, user): + def all_bookmarks_priority_to_study(cls, user, limit): """ Looks at all the bookmarks available to the user and prioritizes them based on the Rank of the words. @@ -252,8 +259,13 @@ def all_bookmarks_priority_to_study(cls, user): 1. Words that are most common in the language (utilizing the word rank in the db 2. 
Words that are closest to being learned (indicated by `cooling_interval`, the highest the closest it is) """ + import time + + start = time.time() def priority_by_rank(bookmark): + # If this is updated remember to update the order_by in + # get_scheduled_bookmarks_for_user and get_unscheduled_bookmarks_for_user bookmark_info = bookmark.json_serializable_dict() cooling_interval = bookmark_info["cooling_interval"] cooling_interval = cooling_interval if cooling_interval is not None else -1 @@ -262,18 +274,22 @@ def priority_by_rank(bookmark): word_rank = UserWord.IMPOSSIBLE_RANK return word_rank, -cooling_interval - scheduled_candidates = cls.get_scheduled_bookmarks_for_user(user) - unscheduled_bookmarks = cls.get_unscheduled_bookmarks_for_user(user) + scheduled_candidates = cls.get_scheduled_bookmarks_for_user(user, limit) + unscheduled_bookmarks = cls.get_unscheduled_bookmarks_for_user(user, limit) all_possible_bookmarks = scheduled_candidates + unscheduled_bookmarks no_duplicate_bookmarks = cls.remove_duplicated_bookmarks(all_possible_bookmarks) sorted_candidates = sorted( no_duplicate_bookmarks, key=lambda x: priority_by_rank(x) ) + end = time.time() - start + print( + f"### INFO: `all_bookmarks_priority_to_study` took: {end:.4f} seconds, total: {len(sorted_candidates)}" + ) return sorted_candidates @classmethod - def priority_scheduled_bookmarks_to_study(cls, user): + def priority_scheduled_bookmarks_to_study(cls, user, limit): """ Prioritizes the bookmarks to study. To randomize the exercise order utilize the Frontend assignBookmarksToExercises.js @@ -285,6 +301,9 @@ def priority_scheduled_bookmarks_to_study(cls, user): 1. Words that are closest to being learned (indicated by `cooling_interval`, the highest the closest it is) 2. Words that are most common in the language (utilizing the word rank in the db) """ + import time + + start = time.time() def priority_by_cooling_interval(bookmark): bookmark_info = bookmark.json_serializable_dict() @@ -295,12 +314,16 @@ def priority_by_cooling_interval(bookmark): word_rank = UserWord.IMPOSSIBLE_RANK return -cooling_interval, word_rank - scheduled_candidates = cls.get_scheduled_bookmarks_for_user(user) + scheduled_candidates = cls.get_scheduled_bookmarks_for_user(user, limit) no_duplicate_bookmarks = cls.remove_duplicated_bookmarks(scheduled_candidates) sorted_candidates = sorted( no_duplicate_bookmarks, key=lambda x: priority_by_cooling_interval(x) ) + end = time.time() - start + print( + f"### INFO: `priority_scheduled_bookmarks_to_study` took: {end:.4f} seconds" + ) return sorted_candidates @classmethod From 6a29e7fe5bb0a2eb9e8e1cb956a357b851606a62 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Fri, 25 Oct 2024 10:13:56 +0200 Subject: [PATCH 08/71] Removed time measurements --- zeeguu/api/endpoints/exercises.py | 15 +-------------- zeeguu/core/word_scheduling/basicSR/basicSR.py | 14 -------------- 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/zeeguu/api/endpoints/exercises.py b/zeeguu/api/endpoints/exercises.py index c3c5ff43..47c2db3e 100644 --- a/zeeguu/api/endpoints/exercises.py +++ b/zeeguu/api/endpoints/exercises.py @@ -35,16 +35,9 @@ def top_bookmarks_to_study(): Return all the possible bookmarks a user has to study ordered by how common it is in the language and how close they are to being learned. 
""" - import time - - start = time.time() user = User.find_by_id(flask.g.user_id) to_study = user.bookmarks_to_study(scheduled_only=False) json_bookmarks = [bookmark.json_serializable_dict() for bookmark in to_study] - end = time.time() - start - print( - f"### INFO: `top_bookmarks_to_study` took: {end:.4f} seconds, total: {len(json_bookmarks)}" - ) return json_result(json_bookmarks) @@ -56,16 +49,10 @@ def bookmarks_to_learn_not_scheduled(): Return all the bookmarks that aren't learned and haven't been scheduled to the user. """ - import time - - start = time.time() user = User.find_by_id(flask.g.user_id) to_study = user.bookmarks_to_learn_not_in_pipeline() json_bookmarks = [bookmark.json_serializable_dict() for bookmark in to_study] - end = time.time() - start - print( - f"### INFO: `bookmarks_to_learn_not_scheduled` took: {end:.4f} seconds, total: {len(json_bookmarks)}" - ) + return json_result(json_bookmarks) diff --git a/zeeguu/core/word_scheduling/basicSR/basicSR.py b/zeeguu/core/word_scheduling/basicSR/basicSR.py index 347a0508..d594da66 100644 --- a/zeeguu/core/word_scheduling/basicSR/basicSR.py +++ b/zeeguu/core/word_scheduling/basicSR/basicSR.py @@ -259,9 +259,6 @@ def all_bookmarks_priority_to_study(cls, user, limit): 1. Words that are most common in the language (utilizing the word rank in the db 2. Words that are closest to being learned (indicated by `cooling_interval`, the highest the closest it is) """ - import time - - start = time.time() def priority_by_rank(bookmark): # If this is updated remember to update the order_by in @@ -282,10 +279,6 @@ def priority_by_rank(bookmark): sorted_candidates = sorted( no_duplicate_bookmarks, key=lambda x: priority_by_rank(x) ) - end = time.time() - start - print( - f"### INFO: `all_bookmarks_priority_to_study` took: {end:.4f} seconds, total: {len(sorted_candidates)}" - ) return sorted_candidates @classmethod @@ -301,9 +294,6 @@ def priority_scheduled_bookmarks_to_study(cls, user, limit): 1. Words that are closest to being learned (indicated by `cooling_interval`, the highest the closest it is) 2. Words that are most common in the language (utilizing the word rank in the db) """ - import time - - start = time.time() def priority_by_cooling_interval(bookmark): bookmark_info = bookmark.json_serializable_dict() @@ -320,10 +310,6 @@ def priority_by_cooling_interval(bookmark): sorted_candidates = sorted( no_duplicate_bookmarks, key=lambda x: priority_by_cooling_interval(x) ) - end = time.time() - start - print( - f"### INFO: `priority_scheduled_bookmarks_to_study` took: {end:.4f} seconds" - ) return sorted_candidates @classmethod From 749bd16c5e79336ff0f59635b269c4d8bdff1acb Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Fri, 25 Oct 2024 10:26:17 +0200 Subject: [PATCH 09/71] Updated comments - Removed sorting in priority_scheduled_bookmarks_to_study as now this is done purely in SQL. 
--- .../core/word_scheduling/basicSR/basicSR.py | 35 ++++++++-----------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/zeeguu/core/word_scheduling/basicSR/basicSR.py b/zeeguu/core/word_scheduling/basicSR/basicSR.py
index d594da66..9f6909a5 100644
--- a/zeeguu/core/word_scheduling/basicSR/basicSR.py
+++ b/zeeguu/core/word_scheduling/basicSR/basicSR.py
@@ -204,7 +204,12 @@ def get_scheduled_bookmarks_for_user(cls, user, limit):
         scheduled_candidates_query = scheduled_candidates_query.filter(
             Bookmark.learning_cycle == LearningCycle.RECEPTIVE
         )
-        scheduled_candidates_query.order_by(-UserWord.rank.desc(), cls.cooling_interval)
+        # The scheduled bookmarks are sorted by the most common in the language and
+        # then by cooling interval, meaning the words that are closest to being learned
+        # come before the ones that have just started being learned.
+        scheduled_candidates_query = scheduled_candidates_query.order_by(
+            -UserWord.rank.desc(), cls.cooling_interval.desc()
+        )  # By using the negative for rank, we ensure NULL is last.
         if limit is None:
             return scheduled_candidates_query.all()
         else:
@@ -220,7 +225,9 @@ def get_unscheduled_bookmarks_for_user(cls, user, limit):
             .join(UserWord, Bookmark.origin_id == UserWord.id)
             .filter(UserWord.language_id == user.learned_language_id)
             .filter(BasicSRSchedule.cooling_interval == None)
-            .order_by(-UserWord.rank.desc())
+            .order_by(
+                -UserWord.rank.desc()
+            )  # By using the negative for rank, we ensure NULL is last.
         )
         if limit is None:
             return unscheduled_bookmarks.all()
@@ -256,8 +263,9 @@ def all_bookmarks_priority_to_study(cls, user, limit):
         this method, we do not need to explicitly schedule new words.

         Currently, we prioritize bookmarks in the following way:
-        1. Words that are most common in the language (utilizing the word rank in the db
-        2. Words that are closest to being learned (indicated by `cooling_interval`, the highest the closest it is)
+        1. Words that are most common in the language (utilizing the word rank in the db)
+        2. Words that are closest to being learned (indicated by `cooling_interval`;
+           the higher it is, the closer the word is to being learned)
         """

         def priority_by_rank(bookmark):
@@ -290,27 +298,14 @@ def priority_scheduled_bookmarks_to_study(cls, user, limit):
         The original logic is kept in bookmarks_to_study as it is called to get
         similar_words to function as distractors in the exercises.

-        Currently, we prioritize bookmarks in the following way:
-        1. Words that are closest to being learned (indicated by `cooling_interval`, the highest the closest it is)
-        2.
Words that are most common in the language (utilizing the word rank in the db)
+        To update the order of bookmarks, look at the order_by in
+        get_scheduled_bookmarks_for_user
         """

-        def priority_by_cooling_interval(bookmark):
-            bookmark_info = bookmark.json_serializable_dict()
-            cooling_interval = bookmark_info["cooling_interval"]
-            cooling_interval = cooling_interval if cooling_interval is not None else -1
-            word_rank = bookmark_info["origin_rank"]
-            if word_rank == "":
-                word_rank = UserWord.IMPOSSIBLE_RANK
-            return -cooling_interval, word_rank
-
         scheduled_candidates = cls.get_scheduled_bookmarks_for_user(user, limit)

         no_duplicate_bookmarks = cls.remove_duplicated_bookmarks(scheduled_candidates)
-        sorted_candidates = sorted(
-            no_duplicate_bookmarks, key=lambda x: priority_by_cooling_interval(x)
-        )
-        return sorted_candidates
+        return no_duplicate_bookmarks

     @classmethod
     def bookmarks_to_study(cls, user, required_count):

From 2aaf745de55b45e14d51af157e83be338581046d Mon Sep 17 00:00:00 2001
From: Tiago Ribeiro
Date: Thu, 31 Oct 2024 10:22:42 +0100
Subject: [PATCH 10/71] Update docker-compose.yml
- Updated the docker-compose file to use ES8 and to allow developers to run the API without going through apache (logs will go directly to Docker)
- Added the embedding-api
---
 docker-compose.yml | 55 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 49 insertions(+), 6 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 897da149..4a74def0 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -30,7 +30,22 @@ services:
     restart: unless-stopped

-
+  elasticsearch_v8:
+    image: elasticsearch:8.12.2
+    platform: linux/amd64
+    # ports:
+    #   - 9200:9200
+    #   - 9300:9300
+    # Ports don't have to be exposed, but exposing them can be nice for debugging.
+    environment:
+      - discovery.type=single-node
+      - xpack.security.enabled=false
+    volumes:
+      - ./data/elasticsearch_db_v8/data:/usr/share/elasticsearch/data
+    networks:
+      - zeeguu_backend
+    restart: unless-stopped
+    # mem_limit: 2048m # Useful to restrict the amount of RAM used by ES.

   elasticsearch:
     image: elasticsearch:7.6.2
@@ -50,13 +65,33 @@ services:
       - zeeguu_backend
     restart: unless-stopped

+  embedding_api:
+    image: zeeguu/semantic-emb-api:main
+    # deploy:
+    #   resources:
+    #     limits:
+    #       cpus: '0.50'
+    environment:
+      SEMANTIC_EMB_API_PORT: 3654
+    # ports:
+    #   - 3654:3654
+    # Ports don't have to be exposed, but exposing them can be nice for debugging.
+    entrypoint: "python ./semantic-emb-api/app/app.py"
+    volumes:
+      - .:/Zeeguu-API
+      - ./data/zeeguu/language-models:/semantic-emb-api/semantic-emb-api/app/semantic_vector/binaries
+    networks:
+      - zeeguu_backend

  # yaml-anchors:
  # https://support.atlassian.com/bitbucket-cloud/docs/yaml-anchors/
  zapi: &zapi_default
    depends_on:
      - mysql
      - fmd_mysql
-      - elasticsearch
+      - elasticsearch_v8
      - readability_server
+      - embedding_api
    image: zeeguu/api:latest
    build: .
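    # the &zapi_default and &zapi_env anchors declared in this service let
    # zapi_dev and zapi_dev_translations (defined below) reuse the whole
    # definition via <<: merges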
ports: @@ -70,7 +105,7 @@ services: # - ./api.cfg:/Zeeguu-API/api.cfg # - ./fmd.cfg:/Zeeguu-API/fmd.cfg # - ./lu-mir-zeeguu-credentials.json:/Zeeguu-API/lu-mir-zeeguu-credentials.json - environment: + environment: &zapi_env ZEEGUU_CONFIG: /Zeeguu-API/api.cfg ZEEGUU_DATA_FOLDER: /zeeguu-data/ SENTRY_DSN: ${SENTRY_DSN} @@ -80,19 +115,27 @@ services: GOOGLE_APPLICATION_CREDENTIALS: /Zeeguu-API/lu-mir-zeeguu-credentials.json WORDNIK_API_KEY: ${WORDNIK_API_KEY} MULTI_LANG_TRANSLATOR_AB_TESTING: ${MULTI_LANG_TRANSLATOR_AB_TESTING} - ZEEGUU_ES_CONN_STRING: 'http://elasticsearch:9200' + ZEEGUU_ES_CONN_STRING: 'http://elasticsearch_v8:9200' FOLDER_FOR_REPORT_OUTPUT: /zeeguu-data/crawl-reports CRAWL_REPORT_DATA: /zeeguu-data/crawl-reports/data + ZEEGUU_EMB_API_CONN_STRING: "http://embedding_api:3654" command: /usr/sbin/apache2ctl -D FOREGROUND networks: - zeeguu_backend restart: unless-stopped + zapi_dev: &zapi_dev + <<: *zapi_default + command: python /Zeeguu-API/start.py + zapi_dev_translations: + <<: *zapi_dev + environment: + <<: *zapi_env + DEV_SKIP_TRANSLATION: 1 # TODO add the zapi_pink back - networks: zeeguu_backend: From 754eb11467f8fd4ef7e7156651e10a94c8a53841 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Thu, 31 Oct 2024 12:29:07 +0100 Subject: [PATCH 11/71] Create migrate_old_topics_to_new_topics.py - Added script with mappings from the old topics to the new ones. --- .../migrate_old_topics_to_new_topics.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 tools/es_v8_migration/migrate_old_topics_to_new_topics.py diff --git a/tools/es_v8_migration/migrate_old_topics_to_new_topics.py b/tools/es_v8_migration/migrate_old_topics_to_new_topics.py new file mode 100644 index 00000000..48f1ba3e --- /dev/null +++ b/tools/es_v8_migration/migrate_old_topics_to_new_topics.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python + +""" + Script to ensure that users have their corresponding topics in the New Topic + mapping. +""" + +import zeeguu.core +from zeeguu.api.app import create_app +from zeeguu.core.model import TopicSubscription, NewTopicSubscription, NewTopic +from tqdm import tqdm + +app = create_app() +app.app_context().push() + +db_session = zeeguu.core.model.db.session + +COMMIT_STEP = 100 # Commit after 100 updates. +VERBOSE = False # Print every update. 
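+# Changes are committed in batches of COMMIT_STEP rows, so an interrupted run
+# loses at most the current batch; find_or_create makes re-running safe.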
+
+OLD_TOPIC_TO_NEW_TOPIC_MAP = {
+    10: 1,  # Sport -> Sports
+    11: 5,  # Health -> Health & Society
+    12: 3,  # Technology -> Technology & Science
+    13: 7,  # Politics -> Politics
+    14: 3,  # Science -> Technology & Science
+    15: 2,  # Culture -> Culture & Art
+    16: 4,  # Travel -> Travel & Tourism
+    # 17 Food was skipped, no exact match
+    18: 6,  # Business -> Business
+    19: 8,  # Satire -> Satire
+    20: 2,  # Music -> Culture & Art
+    21: 5,  # Social Sciences -> Health & Society
+    # 22 World was skipped
+    # 23 Internet was skipped
+    24: 3,  # Knowledge -> Technology & Science
+}
+
+
+# languages = Language.available_languages()
+print("Getting all current topics (old) subscriptions for users: ")
+current_topics = db_session.query(TopicSubscription).all()
+
+for i, topic_sub in tqdm(enumerate(current_topics), total=len(current_topics)):
+    user = topic_sub.user
+    old_topic = topic_sub.topic
+    new_topic_id = OLD_TOPIC_TO_NEW_TOPIC_MAP.get(old_topic.id, None)
+    if new_topic_id:
+        new_topic = NewTopic.find_by_id(new_topic_id)
+        new_user_sub = NewTopicSubscription.find_or_create(db_session, user, new_topic)
+        if VERBOSE:
+            print(
+                f"User {user.id} was subscribed to '{old_topic.title}' and is now subscribed to '{new_topic.title}'"
+            )
+    if i % COMMIT_STEP == 0:
+        if VERBOSE:
+            print("Committing...")
+        db_session.commit()
+db_session.commit()
+if VERBOSE:
+    print("End updating users...")

From 06889613ab179f8f8e4d76a05dc0be2d3fa0bc5b Mon Sep 17 00:00:00 2001
From: Tiago Ribeiro
Date: Thu, 31 Oct 2024 15:11:21 +0100
Subject: [PATCH 12/71] Check available topics for user language
- Added caching so it's updated once every week.
- If no language is passed, all topics are returned (previous behaviour).
---
 zeeguu/api/endpoints/topics.py | 8 ++++----
 zeeguu/core/model/new_topic.py | 37 ++++++++++++++++++++++++++++++--
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/zeeguu/api/endpoints/topics.py b/zeeguu/api/endpoints/topics.py
index 633a2b4a..50c9a69d 100644
--- a/zeeguu/api/endpoints/topics.py
+++ b/zeeguu/api/endpoints/topics.py
@@ -243,13 +243,13 @@ def get_available_new_topics():
     topic_data = []
     user = User.find_by_id(flask.g.user_id)
     already_subscribed = [
-        each.new_topic for each in NewTopicSubscription.all_for_user(user)
+        each.new_topic.id for each in NewTopicSubscription.all_for_user(user)
     ]
-
-    topics = NewTopic.get_all_topics()
+    user_learning_language = Language.find_by_id(user.learned_language_id)
+    topics = NewTopic.get_all_topics(user_learning_language)

     for topic in topics:
-        if topic not in already_subscribed:
+        if topic.id not in already_subscribed:
             topic_data.append(topic.as_dictionary())

     return json_result(topic_data)

diff --git a/zeeguu/core/model/new_topic.py b/zeeguu/core/model/new_topic.py
index 8b9423b4..2a4a9ac7 100644
--- a/zeeguu/core/model/new_topic.py
+++ b/zeeguu/core/model/new_topic.py
@@ -3,6 +3,9 @@
 from sqlalchemy import Column, Integer, String
 from sqlalchemy.orm import relationship
 from zeeguu.core.model import db
+from zeeguu.core.model.language import Language
+from zeeguu.core.model.new_article_topic_map import NewArticleTopicMap
+from zeeguu.core.util.time import get_server_time_utc

 class NewTopic(db.Model):
@@ -22,6 +25,7 @@ class NewTopic(db.Model):
     title = Column(String(64))

     articles = relationship("NewArticleTopicMap", back_populates="new_topic")
+    language_topic_available_cache = {}

     def __init__(self, title):
         self.title = title
@@ -86,5 +90,34 @@ def find_by_id(cls, i):
             return None

     @classmethod
-    def get_all_topics(cls):
-        return
NewTopic.query.order_by(NewTopic.title).all() + def get_all_topics(cls, language: Language = None): + from zeeguu.core.model.article import Article + + def update_available_topic_cache(): + topics_for_language = ( + NewTopic.query.join(NewArticleTopicMap) + .join(Article) + .filter(Article.language_id == language.id) + .distinct(NewTopic.id) + .all() + ) + cls.language_topic_available_cache[language.id] = ( + topics_for_language, + get_server_time_utc(), + ) + + if language is None: + return NewTopic.query.order_by(NewTopic.title).all() + topics_available, last_check = cls.language_topic_available_cache.get( + language.id, (None, None) + ) + + if last_check is None: + update_available_topic_cache() + else: + time_since_last_check = get_server_time_utc() - last_check + if time_since_last_check.days > 7: + update_available_topic_cache() + + topics_available = cls.language_topic_available_cache[language.id][0] + return topics_available From bfc15d00fa80ae4b228ca2461946eebb5ee5c353 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Fri, 1 Nov 2024 09:34:44 +0100 Subject: [PATCH 13/71] Update mysql_to_elastic_new_topics.py - Reduced the number of indexed documents for developer ease. - Improved filtering of existing documents in ES by usage of scan. E.g.: ## Before: Total articles missing: 238244 Total articles missing: 238244, filtered in 262.06 seconds. ## After: Got articles with topics, total: 244631 Total articles missing: 238234, filtered in 4.27 seconds. --- tools/mysql_to_elastic_new_topics.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/tools/mysql_to_elastic_new_topics.py b/tools/mysql_to_elastic_new_topics.py index 6ae0ccdd..a58a07aa 100644 --- a/tools/mysql_to_elastic_new_topics.py +++ b/tools/mysql_to_elastic_new_topics.py @@ -5,8 +5,8 @@ create_or_update_bulk_docs, ) from sqlalchemy import func -from elasticsearch import Elasticsearch, helpers -from elasticsearch.helpers import bulk +from elasticsearch import Elasticsearch +from elasticsearch.helpers import bulk, scan import zeeguu.core from zeeguu.core.model import Article from datetime import datetime @@ -19,6 +19,7 @@ from zeeguu.core.model.new_article_topic_map import TopicOriginType import numpy as np from tqdm import tqdm +import time app = create_app() app.app_context().push() @@ -37,8 +38,8 @@ # ITERATION_STEP - number of articles to index before reporting. Default: 1000 DELETE_INDEX = False INDEX_WITH_TOPIC_ONLY = True -TOTAL_ITEMS = 5000 -ITERATION_STEP = 1000 +TOTAL_ITEMS = 1000 +ITERATION_STEP = 10 print(ES_CONN_STRING) es = Elasticsearch(ES_CONN_STRING) @@ -127,10 +128,16 @@ def gen_docs(articles_w_topics): if len(target_ids) == 0: print("No articles found! Exiting...") return - target_ids_not_in_es = list( - filter(lambda x: not es.exists(index=ES_ZINDEX, id=x), target_ids) + start = time.time() + es_query = {"query": {"match_all": {}}} + ids_in_es = set( + [int(hit["_id"]) for hit in scan(es, index=ES_ZINDEX, query=es_query)] + ) + target_ids_not_in_es = list(filter(lambda x: x not in ids_in_es, target_ids)) + end = time.time() - start + print( + f"Total articles missing: {len(target_ids_not_in_es)}, filtered in {end:.2f} seconds." ) - print("Total articles missing: ", len(target_ids_not_in_es)) # I noticed that if a document is not added then it won't let me query the ES search. 
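    # (i.e. the index is only created when the first document is added; until
    # then, searches against it fail — so the counter below tracks whether
    # anything was actually indexed)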
total_added = 0

From 953aab9b185534711d2a3c47d7e20bf4df7fcfdc Mon Sep 17 00:00:00 2001
From: Tiago Ribeiro
Date: Fri, 1 Nov 2024 16:12:45 +0100
Subject: [PATCH 14/71] Update docker-compose.yml
- Limit the memory for ES and Embedding API

These changes aim to make the Docker load lighter on developer machines, since developers have more limited resources. All these changes result in about 4.41 GB of load on my computer.
---
 docker-compose.yml | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 4a74def0..0c01b9a9 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -45,7 +45,7 @@ services:
     networks:
       - zeeguu_backend
     restart: unless-stopped
-    # mem_limit: 2048m # Useful to restrict the amount of RAM used by ES.
+    mem_limit: 512m # Useful to restrict the amount of RAM used by ES.

   elasticsearch:
     image: elasticsearch:7.6.2
@@ -57,6 +57,7 @@ services:
     networks:
       - zeeguu_backend
     restart: unless-stopped
+    mem_limit: 512m

   readability_server:
@@ -82,6 +83,12 @@ services:
       - ./data/zeeguu/language-models:/semantic-emb-api/semantic-emb-api/app/semantic_vector/binaries
     networks:
       - zeeguu_backend
+    # When encoding certain longer documents, it might use more than the
+    # available memory allocated to the container; in that case the service is restarted.
+    mem_limit: 2512m # From testing, the container usually needs about 2GBs to run.
+    # However, as a dev safeguard, if needed the container is restarted to avoid
+    # taking too much memory from the host.
+    restart: unless-stopped

From a7b15a0aa70690cebb73b1612111d9c4c32a68e3 Mon Sep 17 00:00:00 2001
From: Mircea Filip Lungu
Date: Sun, 3 Nov 2024 20:50:05 +0100
Subject: [PATCH 15/71] printing recent users by teacher
---
 tools/old/users_recently_active.py | 19 ------------------
 tools/users_recently_active.py | 32 ++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 19 deletions(-)
 delete mode 100755 tools/old/users_recently_active.py
 create mode 100755 tools/users_recently_active.py

diff --git a/tools/old/users_recently_active.py b/tools/old/users_recently_active.py
deleted file mode 100755
index 927d8e5e..00000000
--- a/tools/old/users_recently_active.py
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/usr/bin/env python
-
-"""
-
- Script that lists recent users
-
- To be called from a cron job.
- -""" - -from zeeguu.core.model import User - -for user_id in User.all_recent_user_ids(): - user = User.find_by_id(user_id) - print (user.name) - print (user.email) - - - diff --git a/tools/users_recently_active.py b/tools/users_recently_active.py new file mode 100755 index 00000000..76923391 --- /dev/null +++ b/tools/users_recently_active.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python + +""" + + Script that lists recent users + +""" + +from zeeguu.api.app import create_app + +app = create_app() +app.app_context().push() + +from collections import defaultdict + +teacher_student_map = defaultdict(list) + +from zeeguu.core.model import User + +for user_id in User.all_recent_user_ids(): + user = User.find_by_id(user_id) + # print(f"{user.name} ({user.email})") + for ucmap in user.cohorts: + # print(f"{ucmap.cohort.name}") + for teacher in ucmap.cohort.get_teachers(): + # print(f"{teacher.name}") + teacher_student_map[teacher].append(user.name) + + # print("") +for key, values in teacher_student_map.items(): + for v in values: + print(key.email, " : ", v) From d7b3c25de4286adff864f8054056aa80146758fa Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Sun, 3 Nov 2024 21:01:16 +0100 Subject: [PATCH 16/71] nicer printing of students in classes --- tools/users_recently_active.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tools/users_recently_active.py b/tools/users_recently_active.py index 76923391..14c6dbf1 100755 --- a/tools/users_recently_active.py +++ b/tools/users_recently_active.py @@ -13,7 +13,7 @@ from collections import defaultdict -teacher_student_map = defaultdict(list) +cohort_student_map = defaultdict(list) from zeeguu.core.model import User @@ -22,11 +22,18 @@ # print(f"{user.name} ({user.email})") for ucmap in user.cohorts: # print(f"{ucmap.cohort.name}") - for teacher in ucmap.cohort.get_teachers(): - # print(f"{teacher.name}") - teacher_student_map[teacher].append(user.name) + # print(f"{teacher.name}") + cohort_student_map[ucmap.cohort].append(user.name) # print("") -for key, values in teacher_student_map.items(): +for cohort, values in cohort_student_map.items(): + print(f"============================") + print(f"{cohort.name} ({cohort.language.code if cohort.language else ''})") + print(f"============================") + + for teacher in cohort.get_teachers(): + print(f" {teacher.name} ({teacher.email})") for v in values: - print(key.email, " : ", v) + print(" - ", v) + + print(" ") From d16092917e568608e54f5a89d6afcbb8d20ac49f Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Mon, 4 Nov 2024 12:29:37 +0100 Subject: [PATCH 17/71] toggled new_topics for all --- zeeguu/api/endpoints/feature_toggles.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/zeeguu/api/endpoints/feature_toggles.py b/zeeguu/api/endpoints/feature_toggles.py index f0242633..b640e435 100644 --- a/zeeguu/api/endpoints/feature_toggles.py +++ b/zeeguu/api/endpoints/feature_toggles.py @@ -55,13 +55,7 @@ def _feature_map(): def _new_topics(user): - right_user = ( - user.id == 534 - or user.id == 4022 - or user.id == 4089 - or user.invitation_code == "zeeguu-preview" - ) - return right_user + return True def _tiago_exercises(user): From 153281b8a0e36b504602643f5f58e2e325fe9a53 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Mon, 4 Nov 2024 14:47:28 +0100 Subject: [PATCH 18/71] Added new topics to url_keywords --- ...24-11-04--updating-url-keyword-mapping.sql | 350 ++++++++++++++++++ 1 file changed, 350 insertions(+) create mode 100644 
tools/migrations/24-11-04--updating-url-keyword-mapping.sql

diff --git a/tools/migrations/24-11-04--updating-url-keyword-mapping.sql b/tools/migrations/24-11-04--updating-url-keyword-mapping.sql
new file mode 100644
index 00000000..b607dba7
--- /dev/null
+++ b/tools/migrations/24-11-04--updating-url-keyword-mapping.sql
@@ -0,0 +1,350 @@
# Swedish 'ekonomi' -> Business
update
    url_keyword
set
    new_topic_id = 6
WHERE
    keyword like 'ekonomi'
    and language_id = 18;

# German 'kunstmarkt' -> Culture & Art
update
    url_keyword
set
    new_topic_id = 2
WHERE
    keyword like 'kunstmarkt'
    and language_id = 3;

# German 'wintersport' -> Sports
update
    url_keyword
set
    new_topic_id = 1
WHERE
    keyword like 'wintersport'
    and language_id = 3;

# German 'technik' -> Technology & Science
update
    url_keyword
set
    new_topic_id = 3
WHERE
    keyword like 'technik'
    and language_id = 3;

# German 'buehne und konzert' -> Culture & Art
update
    url_keyword
set
    new_topic_id = 2
WHERE
    keyword like 'buehne und konzert'
    and language_id = 3;

# German 'bildung' -> Health & Society
update
    url_keyword
set
    new_topic_id = 5
WHERE
    keyword like 'bildung'
    and language_id = 3;

# German 'mode design' -> Culture & Art ?
update
    url_keyword
set
    new_topic_id = 2
WHERE
    keyword like 'mode design'
    and language_id = 3;

# Danish 'krop sundhed' -> Health & Society
update
    url_keyword
set
    new_topic_id = 5
WHERE
    keyword like 'krop sundhed'
    and language_id = 2;

# Danish 'naturvidenskab' -> Technology & Science
update
    url_keyword
set
    new_topic_id = 3
WHERE
    keyword like 'naturvidenskab'
    and language_id = 2;

# Danish 'boganmeldelser' -> Culture & Art
update
    url_keyword
set
    new_topic_id = 2
WHERE
    keyword like 'boganmeldelser'
    and language_id = 2;

# Danish 'wimbledon' -> Sports
update
    url_keyword
set
    new_topic_id = 1
WHERE
    keyword like 'wimbledon'
    and language_id = 2;

# Danish 'ep valg' -> Politics
update
    url_keyword
set
    new_topic_id = 7
WHERE
    keyword like 'ep valg'
    and language_id = 2;

# Danish 'badminton' -> Sports
update
    url_keyword
set
    new_topic_id = 1
WHERE
    keyword like 'badminton'
    and language_id = 2;

# Danish 'superliga' -> Sports
update
    url_keyword
set
    new_topic_id = 1
WHERE
    keyword like 'superliga'
    and language_id = 2;

# Danish 'uddannelse' -> Health & Society
update
    url_keyword
set
    new_topic_id = 5
WHERE
    keyword like 'uddannelse'
    and language_id = 2;

# Danish 'kvindelandsholdet' -> Sports
update
    url_keyword
set
    new_topic_id = 1
WHERE
    keyword like 'kvindelandsholdet'
    and language_id = 2;

# Danish 'rummet' -> Technology & Science
update
    url_keyword
set
    new_topic_id = 3
WHERE
    keyword like 'rummet'
    and language_id = 2;

# Danish 'erhverv' -> Business
update
    url_keyword
set
    new_topic_id = 6
WHERE
    keyword like 'erhverv'
    and language_id = 2;

# Danish 'sundhed' -> Health & Society
update
    url_keyword
set
    new_topic_id = 5
WHERE
    keyword like 'sundhed'
    and language_id = 2;

# Danish 'valg i usa' -> Politics
update
    url_keyword
set
    new_topic_id = 7
WHERE
    keyword like 'valg i usa'
    and language_id = 2;

# Danish 'teknologi' -> Technology & Science
update
    url_keyword
set
    new_topic_id = 3
WHERE
    keyword like 'teknologi'
    and language_id = 2;

# French 'sport auto' -> Sports
update
    url_keyword
set
    new_topic_id = 1
WHERE
    keyword like 'sport auto'
    and language_id = 7;

# French 'entreprises' ->
Business +update + url_keyword +set + new_topic_id = 6 +WHERE + keyword like 'entreprises' + and language_id = 7; + +# French 'economie francaise' -> Business +update + url_keyword +set + new_topic_id = 6 +WHERE + keyword like 'economie francaise' + and language_id = 7; + +# French 'tennis de table' -> Sports +update + url_keyword +set + new_topic_id = 1 +WHERE + keyword like 'tennis de table' + and language_id = 7; + +# French 'judo' -> Sports +update + url_keyword +set + new_topic_id = 1 +WHERE + keyword like 'judo' + and language_id = 7; + +# French 'surf' -> Sports +update + url_keyword +set + new_topic_id = 1 +WHERE + keyword like 'surf' + and language_id = 7; + +# French 'histoire' -> Culture & Art +update + url_keyword +set + new_topic_id = 2 +WHERE + keyword like 'histoire' + and language_id = 7; + +# French 'patrimoine' -> Culture & Art +update + url_keyword +set + new_topic_id = 2 +WHERE + keyword like 'patrimoine' + and language_id = 7; + +# French 'gastronomie' -> Travel & Tourism +update + url_keyword +set + new_topic_id = 4 +WHERE + keyword like 'gastronomie' + and language_id = 7; + +# French 'rallye' -> Sports +update + url_keyword +set + new_topic_id = 1 +WHERE + keyword like 'rallye' + and language_id = 7; + +# French 'theatre' -> Culture & Art +update + url_keyword +set + new_topic_id = 2 +WHERE + keyword like 'theatre' + and language_id = 7; + +# French 'cultures pop' -> Culture & Art +update + url_keyword +set + new_topic_id = 2 +WHERE + keyword like 'cultures pop' + and language_id = 7; + +# French 'festival de cannes' -> Culture & Art +update + url_keyword +set + new_topic_id = 2 +WHERE + keyword like 'festival de cannes' + and language_id = 7; + +# French 'jeux olympiques' -> Sports +update + url_keyword +set + new_topic_id = 1 +WHERE + keyword like 'jeux olympiques' + and language_id = 7; + +# French 'volley ball' -> Sports +update + url_keyword +set + new_topic_id = 1 +WHERE + keyword like 'volley ball' + and language_id = 7; + +# Italian 'casa' -> Health & Society +update + url_keyword +set + new_topic_id = 5 +WHERE + keyword like 'casa' + and language_id = 8; + +# Italian 'formulauno' -> Sports +update + url_keyword +set + new_topic_id = 1 +WHERE + keyword like 'formulauno' + and language_id = 8; \ No newline at end of file From 45e6af03b6f3703f04727f15f6a0396ee78335a1 Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Tue, 5 Nov 2024 12:22:09 +0100 Subject: [PATCH 19/71] Update activity_tracking.py --- zeeguu/api/endpoints/activity_tracking.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/zeeguu/api/endpoints/activity_tracking.py b/zeeguu/api/endpoints/activity_tracking.py index 2dcb716e..d8c6046b 100644 --- a/zeeguu/api/endpoints/activity_tracking.py +++ b/zeeguu/api/endpoints/activity_tracking.py @@ -44,3 +44,22 @@ def upload_user_activity_data(): ZeeguuMailer.notify_audio_experiment(request.form, user) return "OK" + + +@api.route("/days_since_last_use", methods=["POST"]) +@cross_domain +@requires_session +def days_since_last_use(): + + from sortedcontainers import SortedList + from datetime import datetime + + activity_data = UserActivityData.find(flask.g.user) + + ordered_dates = SortedList(activity_data, key=lambda x: x.time) + + current_date = datetime.now() + time_difference = current_date - ordered_dates[-1] + days = time_difference.days + + return days From 
36b16f47214316c5ae6393c0911b6ed0c6c2b39d Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Tue, 5 Nov 2024 17:00:39 +0100 Subject: [PATCH 20/71] better matching of password field --- zeeguu/api/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zeeguu/api/app.py b/zeeguu/api/app.py index 94e2f06f..e50df01d 100644 --- a/zeeguu/api/app.py +++ b/zeeguu/api/app.py @@ -82,7 +82,7 @@ def create_app(testing=False): # Log the DB connection string; after masking the password db_connection_string = app.config["SQLALCHEMY_DATABASE_URI"] anon_conn_string = re.sub( - ":([a-zA-Z_][a-zA-Z_0-9]*)@", ":****@", db_connection_string + ":([a-zA-Z_][a-zA-Z_0-9\-]*)@", ":****@", db_connection_string, 8 ) warning("*** ==== ZEEGUU CORE: Linked model with: " + anon_conn_string) From 6e3c21a036b17c8cf2b552bdbe75605114673f11 Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Tue, 5 Nov 2024 17:04:55 +0100 Subject: [PATCH 21/71] not printing app config anymore --- zeeguu/api/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zeeguu/api/app.py b/zeeguu/api/app.py index e50df01d..4a79e696 100644 --- a/zeeguu/api/app.py +++ b/zeeguu/api/app.py @@ -78,7 +78,7 @@ def create_app(testing=False): # We're saving the zeeguu.core.app so we can refer to the config from deep in the code...
zeeguu.core.app = app - print(app.config) + # print(app.config) # Log the DB connection string; after masking the password db_connection_string = app.config["SQLALCHEMY_DATABASE_URI"] anon_conn_string = re.sub( - ":([a-zA-Z_][a-zA-Z_0-9]*)@", ":****@", db_connection_string + ":([a-zA-Z_][a-zA-Z_0-9\-]*)@", ":****@", db_connection_string, 8 ) warning("*** ==== ZEEGUU CORE: Linked model with: " + anon_conn_string) From d28a8473a1296f28520338d522890cc6501daebb Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Tue, 5 Nov 2024 17:06:51 +0100 Subject: [PATCH 23/71] not printing app config anymore and anonymizing the password --- zeeguu/api/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zeeguu/api/app.py b/zeeguu/api/app.py index 4a79e696..db2775e2 100644 --- a/zeeguu/api/app.py +++ b/zeeguu/api/app.py @@ -82,7 +82,7 @@ def create_app(testing=False): # Log the DB connection string; after masking the password db_connection_string = app.config["SQLALCHEMY_DATABASE_URI"] anon_conn_string = re.sub( - ":([a-zA-Z_][a-zA-Z_0-9\-]*)@", ":****@", db_connection_string, 8 + ":([a-zA-Z_][a-zA-Z_0-9\-]*)@", ":****@", db_connection_string ) warning("*** ==== ZEEGUU CORE: Linked model with: " + anon_conn_string) From 4b5a414018fb4f9696a7603f9f449b07bfca32b0 Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Tue, 5 Nov 2024 17:08:53 +0100 Subject: [PATCH 24/71] not printing app config anymore --- zeeguu/config/loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/zeeguu/config/loader.py b/zeeguu/config/loader.py index e73a65c8..2c7fcca8 100644 --- a/zeeguu/config/loader.py +++ b/zeeguu/config/loader.py @@ -32,7 +32,6 @@ def load_configuration_or_abort(app, environ_variable, mandatory_config_keys=[]) app.config.from_pyfile(config_file, silent=False) _assert_configs(app.config, mandatory_config_keys, config_file) print(("ZEEGUU: Loaded {0} config from {1}".format(app.name, config_file))) - print(app.config) except Exception as e: print(str(e)) exit(-1) From 0bfb078498808e59c5382b703ac9121e1ae91826 Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Tue, 5 Nov 2024 17:40:39 +0100 Subject: [PATCH 25/71] smarter query --- zeeguu/api/endpoints/activity_tracking.py | 17 +++++++++-------- zeeguu/core/model/user_activitiy_data.py | 13 +++++++++++++ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/zeeguu/api/endpoints/activity_tracking.py b/zeeguu/api/endpoints/activity_tracking.py index d8c6046b..0ed61a6b 100644 --- a/zeeguu/api/endpoints/activity_tracking.py +++ b/zeeguu/api/endpoints/activity_tracking.py @@ -50,16 +50,17 @@ def upload_user_activity_data(): @cross_domain @requires_session def days_since_last_use(): + """ + Returns the number of days since the last user activity event + or -1 in case there is no user activity event. 
+ """ - from sortedcontainers import SortedList from datetime import datetime - activity_data = UserActivityData.find(flask.g.user) - - ordered_dates = SortedList(activity_data, key=lambda x: x.time) + last_active_time = UserActivityData.get_last_activity_time(flask.g.user) - current_date = datetime.now() - time_difference = current_date - ordered_dates[-1] - days = time_difference.days + if last_active_time: + time_difference = datetime.now() - last_active_time + return time_difference.days - return days + return "-1" diff --git a/zeeguu/core/model/user_activitiy_data.py b/zeeguu/core/model/user_activitiy_data.py index cbf0b2cb..0ff12f25 100644 --- a/zeeguu/core/model/user_activitiy_data.py +++ b/zeeguu/core/model/user_activitiy_data.py @@ -3,6 +3,7 @@ from time import sleep from sqlalchemy import Column, String, Integer, Boolean, DateTime, ForeignKey +from sqlalchemy.exc import NoResultFound from sqlalchemy.orm import relationship from zeeguu.core.model.user_reading_session import ALL_ARTICLE_INTERACTION_ACTIONS @@ -366,3 +367,15 @@ def create_from_post_data(cls, session, data, user): session.add(new_entry) session.commit() + + @classmethod + def get_last_activity_date(cls, user): + + query = cls.query.filter(cls.user_id == user.id) + query = query.order_by(cls.id.desc()).limit(1) + + last_event = query.first() + if last_event: + return last_event.time + + return None From e5366b3a34ccdcfb89894e6e7757dda80d24625d Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Tue, 5 Nov 2024 17:41:08 +0100 Subject: [PATCH 26/71] Revert "not printing app config anymore" This reverts commit 6e3c21a036b17c8cf2b552bdbe75605114673f11. --- zeeguu/api/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zeeguu/api/app.py b/zeeguu/api/app.py index 4a79e696..e50df01d 100644 --- a/zeeguu/api/app.py +++ b/zeeguu/api/app.py @@ -78,7 +78,7 @@ def create_app(testing=False): # We're saving the zeeguu.core.app so we can refer to the config from deep in the code... zeeguu.core.app = app - # print(app.config) + print(app.config) # Log the DB connection string; after masking the password db_connection_string = app.config["SQLALCHEMY_DATABASE_URI"] anon_conn_string = re.sub( From f8cfe3b6122278ada39707d8fdeb95fef6e3eefa Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Tue, 5 Nov 2024 17:41:13 +0100 Subject: [PATCH 27/71] Revert "better matching of password filed" This reverts commit 36b16f47214316c5ae6393c0911b6ed0c6c2b39d. 
--- zeeguu/api/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zeeguu/api/app.py b/zeeguu/api/app.py index e50df01d..94e2f06f 100644 --- a/zeeguu/api/app.py +++ b/zeeguu/api/app.py @@ -82,7 +82,7 @@ def create_app(testing=False): # Log the DB connection string; after masking the password db_connection_string = app.config["SQLALCHEMY_DATABASE_URI"] anon_conn_string = re.sub( - ":([a-zA-Z_][a-zA-Z_0-9\-]*)@", ":****@", db_connection_string, 8 + ":([a-zA-Z_][a-zA-Z_0-9]*)@", ":****@", db_connection_string ) warning("*** ==== ZEEGUU CORE: Linked model with: " + anon_conn_string) From cb33f03537a42ab5a734f30637d81b2633459018 Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Tue, 5 Nov 2024 17:42:41 +0100 Subject: [PATCH 28/71] smarter query --- zeeguu/api/endpoints/activity_tracking.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zeeguu/api/endpoints/activity_tracking.py b/zeeguu/api/endpoints/activity_tracking.py index 0ed61a6b..328ac4a7 100644 --- a/zeeguu/api/endpoints/activity_tracking.py +++ b/zeeguu/api/endpoints/activity_tracking.py @@ -52,7 +52,7 @@ def upload_user_activity_data(): def days_since_last_use(): """ Returns the number of days since the last user activity event - or -1 in case there is no user activity event. + or an empty string in case there is no user activity event. """ from datetime import datetime @@ -63,4 +63,4 @@ def days_since_last_use(): time_difference = datetime.now() - last_active_time return time_difference.days - return "-1" + return "" From f4c32ac0f31627fce91143eb0b75db882d7086d7 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Wed, 6 Nov 2024 09:26:52 +0100 Subject: [PATCH 29/71] Update test.yml - Removed the requirements v8 as it doesn't exist anymore. --- .github/workflows/test.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6f8ca295..d6552cdd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -23,8 +23,7 @@ jobs: run: | python -m pip install --upgrade pip pip install pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - if [ -f requirements_v8.txt ]; then pip install --force-reinstall -r requirements_v8.txt; fi + pip install -r requirements.txt python setup.py develop - name: Test with pytest run: | From 3d33785c78fd076517a03b36840823fbf3da8425 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Wed, 6 Nov 2024 09:57:34 +0100 Subject: [PATCH 30/71] Update requirements.txt Fix to failing automated branch tests. 
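If the pin needs verifying locally, a quick hypothetical sanity check (not part of the patch) confirms which versions pip actually resolved; flask==2.3.2 is already pinned in requirements.txt and werkzeug==3.0.2 is added below:

from importlib.metadata import version

# Print the installed versions of the two pinned packages.
print("flask:", version("flask"))        # expected: 2.3.2
print("werkzeug:", version("werkzeug"))  # expected: 3.0.2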
--- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 7594e6bb..470129c4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,7 @@ elasticsearch-dsl==8.12.0 Faker feedparser flask==2.3.2 +werkzeug==3.0.2 Flask-Assets flask_cors flask_sqlalchemy>=3.0 From 26071fa72958483a6a845e9e319d9027c7b7da26 Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Wed, 6 Nov 2024 09:57:55 +0100 Subject: [PATCH 31/71] should be get not post :) --- zeeguu/api/endpoints/activity_tracking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zeeguu/api/endpoints/activity_tracking.py b/zeeguu/api/endpoints/activity_tracking.py index 328ac4a7..ac0e4dd0 100644 --- a/zeeguu/api/endpoints/activity_tracking.py +++ b/zeeguu/api/endpoints/activity_tracking.py @@ -46,7 +46,7 @@ def upload_user_activity_data(): return "OK" -@api.route("/days_since_last_use", methods=["POST"]) +@api.route("/days_since_last_use", methods=["GET"]) @cross_domain @requires_session def days_since_last_use(): From 8dabdb9a79d469f0ca60930695c660dd46a9863e Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Wed, 6 Nov 2024 10:08:35 +0100 Subject: [PATCH 32/71] rename --- zeeguu/api/endpoints/activity_tracking.py | 2 +- zeeguu/core/model/user_activitiy_data.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/zeeguu/api/endpoints/activity_tracking.py b/zeeguu/api/endpoints/activity_tracking.py index ac0e4dd0..68995501 100644 --- a/zeeguu/api/endpoints/activity_tracking.py +++ b/zeeguu/api/endpoints/activity_tracking.py @@ -57,7 +57,7 @@ def days_since_last_use(): from datetime import datetime - last_active_time = UserActivityData.get_last_activity_time(flask.g.user) + last_active_time = UserActivityData.get_last_activity_timestamp(flask.g.user) if last_active_time: time_difference = datetime.now() - last_active_time diff --git a/zeeguu/core/model/user_activitiy_data.py b/zeeguu/core/model/user_activitiy_data.py index 0ff12f25..b199be03 100644 --- a/zeeguu/core/model/user_activitiy_data.py +++ b/zeeguu/core/model/user_activitiy_data.py @@ -369,7 +369,7 @@ def create_from_post_data(cls, session, data, user): session.commit() @classmethod - def get_last_activity_date(cls, user): + def get_last_activity_timestamp(cls, user): query = cls.query.filter(cls.user_id == user.id) query = query.order_by(cls.id.desc()).limit(1) From c5d4dcd2c42dff05350577f65460b4c341518fc7 Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Wed, 6 Nov 2024 10:10:30 +0100 Subject: [PATCH 33/71] using user_id because it seems flask.g.user is not a thing --- zeeguu/api/endpoints/activity_tracking.py | 2 +- zeeguu/core/model/user_activitiy_data.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/zeeguu/api/endpoints/activity_tracking.py b/zeeguu/api/endpoints/activity_tracking.py index 68995501..cbbe7e52 100644 --- a/zeeguu/api/endpoints/activity_tracking.py +++ b/zeeguu/api/endpoints/activity_tracking.py @@ -57,7 +57,7 @@ def days_since_last_use(): from datetime import datetime - last_active_time = UserActivityData.get_last_activity_timestamp(flask.g.user) + last_active_time = UserActivityData.get_last_activity_timestamp(flask.g.user_id) if last_active_time: time_difference = datetime.now() - last_active_time diff --git a/zeeguu/core/model/user_activitiy_data.py b/zeeguu/core/model/user_activitiy_data.py index b199be03..d46b4682 100644 --- a/zeeguu/core/model/user_activitiy_data.py +++ b/zeeguu/core/model/user_activitiy_data.py @@ -369,9 +369,9 
@@ def create_from_post_data(cls, session, data, user): session.commit() @classmethod - def get_last_activity_timestamp(cls, user): + def get_last_activity_timestamp(cls, user_id): - query = cls.query.filter(cls.user_id == user.id) + query = cls.query.filter(cls.user_id == user_id) query = query.order_by(cls.id.desc()).limit(1) last_event = query.first() From cbe7dc6c3dc746c887de6ad652d655720d14e54f Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Wed, 6 Nov 2024 10:15:49 +0100 Subject: [PATCH 34/71] using user_id because it seems flask.g.user is not a thing --- zeeguu/api/endpoints/activity_tracking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zeeguu/api/endpoints/activity_tracking.py b/zeeguu/api/endpoints/activity_tracking.py index cbbe7e52..4aa64021 100644 --- a/zeeguu/api/endpoints/activity_tracking.py +++ b/zeeguu/api/endpoints/activity_tracking.py @@ -61,6 +61,6 @@ def days_since_last_use(): if last_active_time: time_difference = datetime.now() - last_active_time - return time_difference.days + return str(time_difference.days) return "" From fbc85d3a0484c97feed5fbd7578c45a101ac3fa1 Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Wed, 6 Nov 2024 16:56:20 +0100 Subject: [PATCH 35/71] changed order of building of the zeeguu/api image; python setup at the end. --- Dockerfile | 67 +++++++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3350963e..f3cbaee1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,35 +23,6 @@ RUN apt-get install -y mysql\* # https://stackoverflow.com/questions/5178292/pip-install-mysql-python-fails-with-environmenterror-mysql-config-not-found RUN apt-get install -y default-libmysqlclient-dev -# Zeeguu-Api -# ---------- - -# Declare that this will be mounted from a volume -VOLUME /Zeeguu-API - -# We need to copy the requirements file it in order to be able to install it -# However, we're not copying the whole folder, such that in case we make a change in the folder -# (e.g. to this build file) the whole cache is not invalidated and the build process does -# not have to start from scratch -RUN mkdir /Zeeguu-API -COPY ./requirements.txt /Zeeguu-API/requirements.txt -COPY ./setup.py /Zeeguu-API/setup.py - -# Install requirements and setup -WORKDIR /Zeeguu-API - -RUN python -m pip install -r requirements.txt -RUN python setup.py develop - -# Copy the rest of the files -# (this is done after the requirements are installed, so that the cache is not invalidated) -WORKDIR /Zeeguu-API -COPY . /Zeeguu-API - -ENV ZEEGUU_CONFIG=/Zeeguu-API/default_docker.cfg - -VOLUME /zeeguu-data - # mysql CL client # ------------------------- @@ -101,11 +72,45 @@ RUN echo '\n\ CustomLog ${APACHE_LOG_DIR}/access.log combined\n\ ' > /etc/apache2/sites-available/zeeguu-api.conf -RUN a2dissite 000-default.conf -RUN a2ensite zeeguu-api RUN chown -R www-data:www-data /var/www # have apache listen on port 8080 RUN sed -i "s,Listen 80,Listen 8080,g" /etc/apache2/ports.conf + + +# Zeeguu-Api +# ---------- + +# Declare that this will be mounted from a volume +VOLUME /Zeeguu-API + +# We need to copy the requirements file it in order to be able to install it +# However, we're not copying the whole folder, such that in case we make a change in the folder +# (e.g. 
to this build file) the whole cache is not invalidated and the build process does +# not have to start from scratch +RUN mkdir /Zeeguu-API +COPY ./requirements.txt /Zeeguu-API/requirements.txt +COPY ./setup.py /Zeeguu-API/setup.py + +# Install requirements and setup +WORKDIR /Zeeguu-API + +RUN python -m pip install -r requirements.txt +RUN python setup.py develop + +# Copy the rest of the files +# (this is done after the requirements are installed, so that the cache is not invalidated) +WORKDIR /Zeeguu-API +COPY . /Zeeguu-API + +ENV ZEEGUU_CONFIG=/Zeeguu-API/default_docker.cfg + +VOLUME /zeeguu-data + + +RUN a2dissite 000-default.conf +RUN a2ensite zeeguu-api + + From 99697316c00dadacd19a3af1713c642c1a6b3146 Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Wed, 6 Nov 2024 17:04:20 +0100 Subject: [PATCH 36/71] logging endpoint --- zeeguu/api/utils/route_wrappers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/zeeguu/api/utils/route_wrappers.py b/zeeguu/api/utils/route_wrappers.py index 6f5fb35a..5ebb8095 100644 --- a/zeeguu/api/utils/route_wrappers.py +++ b/zeeguu/api/utils/route_wrappers.py @@ -24,6 +24,7 @@ def requires_session(view): @functools.wraps(view) def wrapped_view(*args, **kwargs): + print("--> /" + view.__name__) try: session_uuid = flask.request.args["session"] user_id, session_expiry_time = SESSION_CACHE.get( From f90c9754ed08149e0ea9695385f627ca17e63903 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Thu, 7 Nov 2024 10:38:56 +0100 Subject: [PATCH 37/71] Added a way to skip testing the feed - Feeds can be tested locally and in production we can skip this test, especially for newspaper it takes quite a while to crawl. --- tools/add_feed.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/tools/add_feed.py b/tools/add_feed.py index a1c839cd..fbd64a3e 100644 --- a/tools/add_feed.py +++ b/tools/add_feed.py @@ -8,9 +8,10 @@ app.app_context().push() -def create_and_test_feed(url: str, feed_type: int): +def create_and_test_feed(url: str, feed_type: int, test_feed: bool): feed = Feed.from_url(url, feed_type=feed_type) - print(feed.feed_health_info()) + if test_feed: + print(feed.feed_health_info()) return feed @@ -19,17 +20,23 @@ def main(): _feed_url = input("Feed url: ") print(f"Available feed types: '{FEED_TYPE}'") feed_type = int(input("What feed type is it? : ")) - test_feed = create_and_test_feed(_feed_url, feed_type) + test_feed = input("Do you want to test to parse the feed (1=yes, else n) : ") == "1" + feed = create_and_test_feed(_feed_url, feed_type, test_feed) - feed_name = input(f"Feed name (Enter for: {test_feed.title}): ") or test_feed.title + feed_name = input(f"Feed name (Enter for: {feed.title}): ") or feed.title print(f"= {feed_name}") - icon_name = input("Icon name to be found in resources folder (e.g. 20min.png): ") + default_icon_name = f"{feed_name.lower().replace(" ", "-")}.png" + icon_name = ( + input( + f"Icon name to be found in resources folder (e.g. 
{default_icon_name}): " + ) + or default_icon_name + ) print(f"= {icon_name}") description = ( - input(f"Description (Enter for: {test_feed.description}): ") - or test_feed.description + input(f"Description (Enter for: {feed.description}): ") or feed.description ) print(f"= {description}") From 643fbc39ff858c44aee3e52006652fa4fe1e712a Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Thu, 7 Nov 2024 10:39:28 +0100 Subject: [PATCH 38/71] Reduced duplication --- tools/crawl_summary/crawl_report.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tools/crawl_summary/crawl_report.py b/tools/crawl_summary/crawl_report.py index 9d4a0247..12c56fe8 100644 --- a/tools/crawl_summary/crawl_report.py +++ b/tools/crawl_summary/crawl_report.py @@ -4,6 +4,7 @@ import inspect import json import pathlib +from zeeguu.core.util.time import get_server_time_utc STR_DATETIME_FORMAT = "%d_%m_%y_%H_%M_%S" CRAWL_REPORT_DATA = os.environ.get( @@ -188,12 +189,16 @@ def __validate_lang(self, lang: str): ) return True - def get_total_non_quality_counts(self, langs_to_load: list[str] = None): + def __load_languages(self, langs_to_load: list[str] = None): if langs_to_load is None: langs_to_load = self.data["lang"].keys() else: for lang in langs_to_load: self.__validate_lang(lang) + return langs_to_load + + def get_total_non_quality_counts(self, langs_to_load: list[str] = None): + langs_to_load = self.__load_languages(langs_to_load) total_counts = Counter() for lang in langs_to_load: @@ -203,11 +208,8 @@ def get_total_non_quality_counts(self, langs_to_load: list[str] = None): return total_counts def get_total_removed_sents_counts(self, langs_to_load: list[str] = None): - if langs_to_load is None: - langs_to_load = self.data["lang"].keys() - else: - for lang in langs_to_load: - self.__validate_lang(lang) + langs_to_load = self.__load_languages(langs_to_load) + total_counts = Counter() for lang in langs_to_load: From 30414e6aaa63f83af494d36bc49e6c2785806f08 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Thu, 7 Nov 2024 10:40:02 +0100 Subject: [PATCH 39/71] Added a table to show Feed activity --- tools/report_generator/data_extractor.py | 25 ++++++++++++++++++++++- tools/report_generator/generate_report.py | 7 +++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/tools/report_generator/data_extractor.py b/tools/report_generator/data_extractor.py index e8bc8fe1..b004494b 100644 --- a/tools/report_generator/data_extractor.py +++ b/tools/report_generator/data_extractor.py @@ -49,6 +49,29 @@ def get_article_new_topics_df(self, feed_df): self.__add_feed_name(df, feed_df) return df + def get_days_since_last_crawl(self): + print("Getting Feeds Last Crawl Time...") + query = f""" + SELECT + feed_id, + f.title, + DATEDIFF(CURDATE(), MAX(published_time)) days_since_last_article, + DATEDIFF(CURDATE(), f.last_crawled_time) days_since_last_feed_crawl + FROM + article a + JOIN feed f ON a.feed_id = f.id + WHERE + f.deactivated = 0 + GROUP by + feed_id + HAVING + days_since_last_feed_crawl <= {self.DAYS_FOR_REPORT} + ORDER BY + days_since_last_article DESC; + """ + df = pd.read_sql(query, con=self.db_connection) + return df + def get_article_df(self, feed_df): print("Getting Articles...") query = f"""SELECT a.*, l.name Language @@ -59,7 +82,7 @@ def get_article_df(self, feed_df): df = pd.read_sql(query, con=self.db_connection) self.__add_feed_name(df, feed_df) return df - + def get_url_keyword_counts(self, min_count=100): print("Getting URL keyword counts...") # Update with values from 
the code. diff --git a/tools/report_generator/generate_report.py b/tools/report_generator/generate_report.py index 5f560602..cd8ebf68 100644 --- a/tools/report_generator/generate_report.py +++ b/tools/report_generator/generate_report.py @@ -573,6 +573,7 @@ def generate_html_page(): top_filtered_searches = data_extractor.get_top_search_filters() newly_added_search_subscriptions = data_extractor.get_added_search_subscriptions() pd_new_url_keywords = data_extractor.get_url_keyword_counts() + pd_feed_innactivity_time = data_extractor.get_days_since_last_crawl() crawl_report = CrawlReport() crawl_report.load_crawl_report_data(DAYS_FOR_REPORT) total_days_from_crawl_report = crawl_report.get_days_from_crawl_report_date() @@ -659,6 +660,9 @@ def generate_html_page(): +

Possible Inactive feeds: + Full table + {generate_html_table(pd_feed_innactivity_time.head(10))} Articles Rejected: {warning_crawl_range} {get_total_reject_article_reason_table(crawl_report.get_total_non_quality_counts())} @@ -715,6 +719,9 @@ def generate_html_page(): Newly url keywords without topics: URL Keywords that occur more than 100 times in articles and are not mapped to a topic. They are language unique. {get_new_url_keywords_table(pd_new_url_keywords) if DAYS_FOR_REPORT <= 7 else "Skipped due to long period."} + + Feed activity:

+ {generate_html_table(pd_feed_innactivity_time)} """ From 4b6954cf2c850217bf05477914bb9615172af694 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Thu, 7 Nov 2024 10:59:45 +0100 Subject: [PATCH 40/71] Update article_downloader.py - Added a way to check if the img parsed is large enough to be displayed. This is mainly to fix the issue where some articles get the website's favicon as the top image, which is not what is intended. --- .../content_retriever/article_downloader.py | 45 ++++++++++++++++--- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/zeeguu/core/content_retriever/article_downloader.py b/zeeguu/core/content_retriever/article_downloader.py index 0fd37c40..849d4ce8 100644 --- a/zeeguu/core/content_retriever/article_downloader.py +++ b/zeeguu/core/content_retriever/article_downloader.py @@ -62,6 +62,13 @@ def banned_url(url): banned = [ "https://www.dr.dk/sporten/seneste-sport/", "https://www.dr.dk/nyheder/seneste/", + # Old Text interface for TVs + "https://www1.wdr.de/wdrtext/", + # Videos + "https://www.tagesschau.de/multimedia", + "https://www.1jour1actu.com/non-classe", + # Paywalled articles: + "https://www.faz.net/pro", ] for each in banned: if url.startswith(each): @@ -221,6 +228,7 @@ def download_from_feed( except Exception as e: import traceback + print(e) traceback.print_stack() capture_to_sentry(e) if hasattr(e, "message"): @@ -306,14 +314,26 @@ def download_feed_item(session, feed, feed_item, url, crawl_report): raise SkippedForLowQuality(reason) if np_article.top_image != "": - new_article.img_url = Url.find_or_create(session, np_article.top_image) + # from https://stackoverflow.com/questions/7391945/how-do-i-read-image-data-from-a-url-in-python + from PIL import Image + import requests + from io import BytesIO + + response = requests.get(np_article.top_image) + im = Image.open(BytesIO(response.content)) + im_x, im_y = im.size + # Quality Check that the image is at least 300x300 ( not an icon ) + if im_x < 300 and im_y < 300: + print("Skipped image due to low resolution") + else: + new_article.img_url = Url.find_or_create(session, np_article.top_image) old_topics = add_topics(new_article, session) logp(f"Old Topics ({old_topics})") url_keywords = add_url_keywords(new_article, session) logp(f"Topic Keywords: ({url_keywords})") if SEMANTIC_SEARCH_AVAILABLE: - origin_type, topics = add_new_topics(new_article, feed, url_keywords, session) + _, topics = add_new_topics(new_article, feed, url_keywords, session) logp(f"New Topics ({topics})") session.add(new_article) return new_article @@ -343,6 +363,7 @@ def add_new_topics(new_article, feed, url_keywords, session): new_article.add_new_topic(topic, session, TopicOriginType.HARDSET.value) session.add(new_article) return TopicOriginType.HARDSET.value, [topic.title] + # Try setting the Topics based on URLs topics = [] topics_added = set() @@ -361,7 +382,11 @@ def add_new_topics(new_article, feed, url_keywords, session): if len(topics) > 0: print("Used URL PARSED") session.add(new_article) - return TopicOriginType.URL_PARSED.value, [t.title for t in topics] + # If we have only one topic and that is News, we will try to infer. 
+ if not (len(topics) == 1 and 9 in topics_added): + return TopicOriginType.URL_PARSED.value, [ + t.new_topic.title for t in new_article.new_topics + ] from collections import Counter @@ -383,8 +408,18 @@ def add_new_topics(new_article, feed, url_keywords, session): top_topic, session, TopicOriginType.INFERRED.value ) session.add(new_article) - return TopicOriginType.INFERRED.value, [top_topic.title] - return None, [] + return TopicOriginType.INFERRED.value, [ + t.new_topic.title for t in new_article.new_topics + ] + + return ( + (None, []) + if len(topics) == 0 + else ( + TopicOriginType.URL_PARSED.value, + [t.new_topic.title for t in new_article.new_topics], + ) + ) def add_url_keywords(new_article, session): From b164b2bcfa5e5f8294836818f661673e51d1ad4a Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Thu, 7 Nov 2024 11:00:32 +0100 Subject: [PATCH 41/71] Added a timeout of 30 seconds to newspaper --- zeeguu/core/feed_handler/newspaperfeed.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/zeeguu/core/feed_handler/newspaperfeed.py b/zeeguu/core/feed_handler/newspaperfeed.py index d7c6203c..4ec3637c 100644 --- a/zeeguu/core/feed_handler/newspaperfeed.py +++ b/zeeguu/core/feed_handler/newspaperfeed.py @@ -37,9 +37,12 @@ def get_feed_articles(self) -> list[dict]: # This makes it complicated to assign a feed and download the articles found at that time. # Currently, it ignores the newspaper's cache and justs uses ours. if self.use_cache: - news_feed = newspaper.build(self.url) + news_feed = newspaper.build(self.url, request_timeout=30) else: + print("NOT skipping cached articles...") news_feed = newspaper.build( self.url, memoize_articles=False, request_timeout=30 ) feed_data = news_feed.articles feed_items = [] From 3471d66d343eb593fadfd934dc21594b65ff5c7a Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Thu, 7 Nov 2024 10:40:34 +0100 Subject: [PATCH 42/71] Fixed error code in crawling - Added a code for the language-mismatch error - The error was not returning the expected number of parameters --- zeeguu/core/content_quality/quality_filter.py | 5 +++-- zeeguu/core/model/article_broken_code_map.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/zeeguu/core/content_quality/quality_filter.py b/zeeguu/core/content_quality/quality_filter.py index f11e4a79..d476bd34 100644 --- a/zeeguu/core/content_quality/quality_filter.py +++ b/zeeguu/core/content_quality/quality_filter.py @@ -48,7 +48,7 @@ def sufficient_quality_html(html): return True, "", "" -def sufficient_quality_plain_text(text, lang_code = None): +def sufficient_quality_plain_text(text, lang_code=None): word_count = len(text.split()) if word_count < Article.MINIMUM_WORD_COUNT: return ( @@ -77,6 +77,7 @@ return ( False, f"Article language '{art_lang}', does not match feed language: '{lang_code}'.", + LowQualityTypes.LANGUAGE_DOES_NOT_MATCH_FEED, ) for each in LIVE_BLOG_KIND_OF_PATTERNS: @@ -96,7 +97,7 @@ return True, "", "" -def sufficient_quality(art: newspaper.Article, lang_code = None) -> tuple[bool, str, str]: +def sufficient_quality(art: newspaper.Article, lang_code=None) -> tuple[bool, str, str]: res, reason, code = sufficient_quality_html(art.html) if not res: return False, reason, code diff --git a/zeeguu/core/model/article_broken_code_map.py b/zeeguu/core/model/article_broken_code_map.py index 
e9265318..0e8d87e9 100644 --- a/zeeguu/core/model/article_broken_code_map.py +++ b/zeeguu/core/model/article_broken_code_map.py @@ -14,6 +14,7 @@ class LowQualityTypes: INCOMPLETE_PATTERN = "INCOMPLETE_PATTERN" LIVE_BLOG = "LIVE_BLOG" ML_PREDICTION = "ML_PREDICTION" + LANGUAGE_DOES_NOT_MATCH_FEED = "LANGUAGE_DOES_NOT_MATCH_FEED" class ArticleBrokenMap(db.Model): From f225d68256c6e477dc5291fbc06837bad4ba4c2d Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Thu, 7 Nov 2024 12:52:26 +0100 Subject: [PATCH 43/71] Added a try/except when parsing the image - This is mostly for the mocked URLs, which were causing the test suite to fail, but in general, if for some reason we don't parse an image, it's good that it doesn't crash the whole article. --- .../content_retriever/article_downloader.py | 19 +++++++++++-------- zeeguu/core/test/mocking_the_web.py | 11 +++++++++++ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/zeeguu/core/content_retriever/article_downloader.py b/zeeguu/core/content_retriever/article_downloader.py index 849d4ce8..fba7a341 100644 --- a/zeeguu/core/content_retriever/article_downloader.py +++ b/zeeguu/core/content_retriever/article_downloader.py @@ -319,14 +319,17 @@ def download_feed_item(session, feed, feed_item, url, crawl_report): import requests from io import BytesIO - response = requests.get(np_article.top_image) - im = Image.open(BytesIO(response.content)) - im_x, im_y = im.size - # Quality Check that the image is at least 300x300 ( not an icon ) - if im_x < 300 and im_y < 300: - print("Skipped image due to low resolution") - else: - new_article.img_url = Url.find_or_create(session, np_article.top_image) + try: + response = requests.get(np_article.top_image) + im = Image.open(BytesIO(response.content)) + im_x, im_y = im.size + # Quality Check that the image is at least 300x300 ( not an icon ) + if im_x < 300 and im_y < 300: + print("Skipped image due to low resolution") + else: + new_article.img_url = Url.find_or_create(session, np_article.top_image) + except Exception as e: + print(f"Failed to parse image: '{e}'") old_topics = add_topics(new_article, session) diff --git a/zeeguu/core/test/mocking_the_web.py b/zeeguu/core/test/mocking_the_web.py index 1be58b85..f934e391 100644 --- a/zeeguu/core/test/mocking_the_web.py +++ b/zeeguu/core/test/mocking_the_web.py @@ -50,6 +50,15 @@ URL_ML_JP_PAYWALL = "https://jyllands-posten.dk/kultur/ECE16582800/puk-damsgaard-leverer-voldsom-kritik-af-vestens-krig-i-afghanistan/#:~:text=Man%20kommer%20ikke%20i%20godt,og%20ligestilling%20i%20al%20evighed." +URL_SPIEGEL_IMG_2 = ( "http://cdn2.spiegel.de/images/image-1387139-860_poster_16x9-fqsg-1387139.jpg" ) + +URL_SPIEGEL_IMG_1 = ( "http://cdn1.spiegel.de/images/image-1387130-860_poster_16x9-gljs-1387130.jpg" ) + URLS_TO_MOCK = { URL_BLINDEN_UND_ELEPHANT: "blinden_und_elefant.html", URL_CNN_KATHMANDU: "cnn_kathmandu.html", @@ -84,6 +93,8 @@ READABILITY_SERVER_CLEANUP_URI + URL_ML_JP_PAYWALL: "jp_article_example.json", # tldextract, dependency of newspaper reaches out for this and makes our tests fail if we don't have net "https://publicsuffix.org/list/public_suffix_list.dat": "public_suffix_list.dat", + URL_SPIEGEL_IMG_1: "spiegel_nancy.html", + URL_SPIEGEL_IMG_2: "spiegel_nancy.html", } From 49997e987a5cc1158355dcf8ad67e8329a5c50c3 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Mon, 11 Nov 2024 10:21:12 +0100 Subject: [PATCH 44/71] Added support for pagination for saved articles. 
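Before the diff, a small self-contained sketch of the limit/offset arithmetic this pagination relies on; the page size of 20 mirrors the TOTAL_ITEMS_PER_PAGE constant introduced below, while the helper name is hypothetical:

TOTAL_ITEMS_PER_PAGE = 20

def page_bounds(page: int, page_size: int = TOTAL_ITEMS_PER_PAGE):
    # Page 0 covers rows [0, 20), page 1 covers [20, 40), and so on;
    # this maps directly onto SQL's LIMIT page_size OFFSET page_size * page.
    offset = page_size * page
    return offset, offset + page_size

assert page_bounds(0) == (0, 20)
assert page_bounds(2) == (40, 60)

With the route below, a request to /user_articles/saved/2 would therefore return the third page of twenty saved articles.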
--- zeeguu/api/endpoints/user_articles.py | 15 +++++++++++++++ zeeguu/core/model/personal_copy.py | 14 ++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/zeeguu/api/endpoints/user_articles.py b/zeeguu/api/endpoints/user_articles.py index 66996152..211a81e5 100644 --- a/zeeguu/api/endpoints/user_articles.py +++ b/zeeguu/api/endpoints/user_articles.py @@ -55,6 +55,21 @@ def user_articles_recommended(count: int = 20, page: int = 0): @api.route("/user_articles/saved", methods=["GET"]) +@api.route("/user_articles/saved/<int:page>", methods=["GET"]) +@cross_domain +@requires_session +def saved_articles(page: int = None): + user = User.find_by_id(flask.g.user_id) + if page is not None: + saves = PersonalCopy.get_page_for(user, page) + else: + saves = PersonalCopy.all_for(user) + + article_infos = [UserArticle.user_article_info(user, e) for e in saves] + + return json_result(article_infos) + + @cross_domain @requires_session def saved_articles(): diff --git a/zeeguu/core/model/personal_copy.py b/zeeguu/core/model/personal_copy.py index 3030d87b..f420c83a 100644 --- a/zeeguu/core/model/personal_copy.py +++ b/zeeguu/core/model/personal_copy.py @@ -7,6 +7,8 @@ from zeeguu.core.model import db +TOTAL_ITEMS_PER_PAGE = 20 + class PersonalCopy(db.Model): __table_args__ = {"mysql_collate": "utf8_bin"} @@ -30,6 +32,18 @@ def exists_for(cls, user, article): PersonalCopy.query.filter_by(user_id=user.id, article_id=article.id).all() ) + @classmethod + def get_page_for(cls, user, page): + return ( + Article.query.join(PersonalCopy) + .filter(PersonalCopy.user_id == user.id) + .filter(Article.language_id == user.learned_language_id) + .order_by(desc(PersonalCopy.id)) + .limit(TOTAL_ITEMS_PER_PAGE) + .offset(TOTAL_ITEMS_PER_PAGE * page) + .all() + ) + @classmethod def all_for(cls, user): return ( From 0d8cc111f2908d6a736511773564000d865f8820 Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Mon, 11 Nov 2024 15:46:13 +0100 Subject: [PATCH 45/71] updated comment --- zeeguu/core/model/url_keyword.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/zeeguu/core/model/url_keyword.py b/zeeguu/core/model/url_keyword.py index 535d38b0..96d48097 100644 --- a/zeeguu/core/model/url_keyword.py +++ b/zeeguu/core/model/url_keyword.py @@ -15,6 +15,12 @@ class UrlKeyword(db.Model): These are words extracted from the URL that can be used as keywords for the New Topic Table. Each keyword is associated with a language, as some language might have the same word for 2 different topics. + + We have a url keyword entry even if it is not mapped to a topic. + Every time we find a URL keyword we store it here, + and we also map it to the article. + + """ EXCLUDE_TOPICS = set( From c7036ddb8b371446fb3fde1bd0063e0bcdc44668 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Tue, 12 Nov 2024 13:20:19 +0100 Subject: [PATCH 46/71] Removed all the logic relating to Topics - First, we remove all the logic relating to the topics. I have checked that the Web and Backend seem to be working OK. 
--- tools/old/add_standard_topics.py | 1 - zeeguu/api/endpoints/topics.py | 216 ------------------ .../user_account_deletion.py | 4 - .../elastic_recommender.py | 30 +-- .../content_retriever/article_downloader.py | 16 +- zeeguu/core/elastic/elastic_query_builder.py | 2 - zeeguu/core/elastic/indexing.py | 85 ++----- zeeguu/core/model/__init__.py | 6 - zeeguu/core/model/article.py | 16 -- zeeguu/core/test/test_article.py | 9 +- zeeguu/core/test/test_retrieve_and_compute.py | 17 -- zeeguu/core/test/test_user_article.py | 1 - 12 files changed, 29 insertions(+), 374 deletions(-) diff --git a/tools/old/add_standard_topics.py b/tools/old/add_standard_topics.py index e8cf680d..3fc03d07 100644 --- a/tools/old/add_standard_topics.py +++ b/tools/old/add_standard_topics.py @@ -14,7 +14,6 @@ import zeeguu.core from zeeguu.core.model.topic import Topic from zeeguu.core.model.language import Language -from zeeguu.core.model.localized_topic import LocalizedTopic db_session = zeeguu.core.model.db.session diff --git a/zeeguu/api/endpoints/topics.py b/zeeguu/api/endpoints/topics.py index 50c9a69d..e33da2d4 100644 --- a/zeeguu/api/endpoints/topics.py +++ b/zeeguu/api/endpoints/topics.py @@ -2,13 +2,9 @@ from zeeguu.logging import log from flask import request from zeeguu.core.model import ( - Topic, NewTopic, - TopicSubscription, NewTopicSubscription, - TopicFilter, NewTopicFilter, - LocalizedTopic, Language, User, ) @@ -21,41 +17,13 @@ db_session = zeeguu.core.model.db.session -SUBSCRIBE_TOPIC = "subscribe_topic" SUBSCRIBE_NEW_TOPIC = "subscribe_new_topic" -UNSUBSCRIBE_TOPIC = "unsubscribe_topic" UNSUBSCRIBE_NEW_TOPIC = "unsubscribe_new_topic" -SUBSCRIBED_TOPICS = "subscribed_topics" SUBSCRIBED_NEW_TOPICS = "subscribed_new_topics" -FILTER_TOPIC = "filter_topic" -UNFILTER_TOPIC = "unfilter_topic" -FILTERED_TOPICS = "filtered_topics" FILTER_NEW_TOPIC = "filter_new_topic" UNFILTER_NEW_TOPIC = "unfilter_new_topic" FILTERED_NEW_TOPICS = "filtered_new_topics" - -# --------------------------------------------------------------------------- -@api.route(f"/{SUBSCRIBE_TOPIC}", methods=("POST",)) -# --------------------------------------------------------------------------- -@cross_domain -@requires_session -def subscribe_to_topic_with_id(): - """ - :param: topic_id -- the id of the topic to be subscribed to. 
- Subscribe to the topic with the given id - - :return: "OK" in case of success - """ - - topic_id = int(request.form.get("topic_id", "")) - user = User.find_by_id(flask.g.user_id) - topic_object = Topic.find_by_id(topic_id) - TopicSubscription.find_or_create(db_session, user, topic_object) - - return "OK" - - # --------------------------------------------------------------------------- @api.route(f"/{SUBSCRIBE_NEW_TOPIC}", methods=("POST",)) # --------------------------------------------------------------------------- @@ -104,63 +72,6 @@ def unsubscribe_from_new_topic(): return "OK" -# --------------------------------------------------------------------------- -@api.route(f"/{UNSUBSCRIBE_TOPIC}", methods=("POST",)) -# --------------------------------------------------------------------------- -@cross_domain -@requires_session -def unsubscribe_from_topic(): - """ - A user can unsubscribe from the topic with a given ID - - :return: "OK" in case of success - """ - - topic_id = int(request.form.get("topic_id", "")) - user = User.find_by_id(flask.g.user_id) - try: - to_delete = TopicSubscription.with_topic_id(topic_id, user) - db_session.delete(to_delete) - db_session.commit() - except Exception as e: - from sentry_sdk import capture_exception - - capture_exception(e) - return "OOPS. FEED AIN'T THERE IT SEEMS (" + str(e) + ")" - - return "OK" - - -# --------------------------------------------------------------------------- -@api.route(f"/{SUBSCRIBED_TOPICS}", methods=("GET",)) -# --------------------------------------------------------------------------- -@cross_domain -@requires_session -def get_subscribed_topics(): - """ - A user might be subscribed to multiple topics at once. - This endpoint returns them as a list. - - :return: a json list with feeds for which the user is registered; - every feed in this list is a dictionary with the following info: - id = unique id of the topic; - title = - """ - user = User.find_by_id(flask.g.user_id) - subscriptions = TopicSubscription.all_for_user(user) - topic_list = [] - for sub in subscriptions: - try: - topic_list.append(sub.topic.as_dictionary()) - except Exception as e: - from sentry_sdk import capture_exception - - capture_exception(e) - log(str(e)) - - return json_result(topic_list) - - # --------------------------------------------------------------------------- @api.route(f"/{SUBSCRIBED_NEW_TOPICS}", methods=("GET",)) # --------------------------------------------------------------------------- @@ -190,41 +101,6 @@ def get_subscribed_new_topics(): return json_result(topic_list) - -# --------------------------------------------------------------------------- -@api.route("/available_topics", methods=("GET",)) -# --------------------------------------------------------------------------- -@cross_domain -@requires_session -def get_available_topics(): - """ - Get a list of interesting topics for the given language. 
- Interesting topics are for now defined as: - - There are articles with that topic in the language - - The topic is not followed yet - - The topic is not in the filters list - - :return: - """ - topic_data = [] - user = User.find_by_id(flask.g.user_id) - already_filtered = [each.topic for each in TopicFilter.all_for_user(user)] - already_subscribed = [each.topic for each in TopicSubscription.all_for_user(user)] - - reading_languages = Language.all_reading_for_user(user) - - loc_topics = [] - for each in reading_languages: - loc_topics.extend(LocalizedTopic.all_for_language(each)) - - topics = [each.topic for each in loc_topics] - - for topic in topics: - if (topic not in already_filtered) and (topic not in already_subscribed): - topic_data.append(topic.as_dictionary()) - return json_result(topic_data) - - # --------------------------------------------------------------------------- @api.route("/available_new_topics", methods=("GET",)) # --------------------------------------------------------------------------- @@ -332,95 +208,3 @@ def get_subscribed_new_filters(): return json_result(filter_list) - -# --------------------------------------------------------------------------- -@api.route(f"/{FILTER_TOPIC}", methods=("POST",)) -# --------------------------------------------------------------------------- -@cross_domain -@requires_session -def subscribe_to_filter_with_id(): - """ - :param: filter_id -- the id of the filter to be subscribed to. - Subscribe to the filter with the given id - - :return: "OK" in case of success - """ - - filter_id = int(request.form.get("filter_id", "")) - user = User.find_by_id(flask.g.user_id) - filter_object = Topic.find_by_id(filter_id) - TopicFilter.find_or_create(db_session, user, filter_object) - - return "OK" - - -# --------------------------------------------------------------------------- -@api.route(f"/{UNFILTER_TOPIC}", methods=("POST",)) -# --------------------------------------------------------------------------- -@cross_domain -@requires_session -def unsubscribe_from_filter(): - """ - A user can unsubscribe from the filter with a given ID - :return: OK / ERROR - """ - - filter_id = int(request.form.get("topic_id", "")) - user = User.find_by_id(flask.g.user_id) - try: - to_delete = TopicFilter.with_topic_id(filter_id, user) - db_session.delete(to_delete) - db_session.commit() - except Exception as e: - from sentry_sdk import capture_exception - - capture_exception(e) - return "OOPS. FILTER AIN'T THERE IT SEEMS (" + str(e) + ")" - - return "OK" - - -# --------------------------------------------------------------------------- -@api.route(f"/{FILTERED_TOPICS}", methods=("GET",)) -# --------------------------------------------------------------------------- -@cross_domain -@requires_session -def get_subscribed_filters(): - """ - A user might be subscribed to multiple filters at once. - This endpoint returns them as a list. 
- - :return: a json list with filters for which the user is registered; - every filter in this list is a dictionary with the following info: - id = unique id of the topic; - title = - """ - user = User.find_by_id(flask.g.user_id) - filters = TopicFilter.all_for_user(user) - filter_list = [] - for fil in filters: - try: - filter_list.append(fil.topic.as_dictionary()) - except Exception as e: - from sentry_sdk import capture_exception - - capture_exception(e) - log(str(e)) - - return json_result(filter_list) - - -@api.route(f"/cache_articles/<code>", methods=("GET",)) -def cache_articles(code): - if code != zeeguu.core.app.config.get("PRIVATE_API_CODE"): - return "Nope" - - from zeeguu.core.model import Topic, Language - - for each in Topic.get_all_topics(): - each.all_articles() - - for each in Language.available_languages(): - each.get_articles() - - return "OK" diff --git a/zeeguu/core/account_management/user_account_deletion.py b/zeeguu/core/account_management/user_account_deletion.py index 13bcd3dd..4c65e225 100644 --- a/zeeguu/core/account_management/user_account_deletion.py +++ b/zeeguu/core/account_management/user_account_deletion.py @@ -16,8 +16,6 @@ from zeeguu.core.model import ( SearchSubscription, - TopicFilter, - TopicSubscription, Teacher, TeacherCohortMap, Session, @@ -32,8 +30,6 @@ tables_to_modify = [ SearchSubscription, - TopicFilter, - TopicSubscription, Session, Teacher, TeacherCohortMap, diff --git a/zeeguu/core/content_recommender/elastic_recommender.py b/zeeguu/core/content_recommender/elastic_recommender.py index 52e10fd0..99561617 100644 --- a/zeeguu/core/content_recommender/elastic_recommender.py +++ b/zeeguu/core/content_recommender/elastic_recommender.py @@ -14,8 +14,6 @@ from zeeguu.core.model import ( Article, - TopicFilter, - TopicSubscription, NewTopicFilter, NewTopicSubscription, SearchFilter, @@ -53,25 +51,7 @@ def _prepare_user_constraints(user): unwanted_user_searches.append(user_search_filter.search.keywords) print(f"keywords to exclude: {unwanted_user_searches}") - # 2. Topics to exclude / filter out - # ================================= - excluded_topics = TopicFilter.all_for_user(user) - topics_to_exclude = [ - each.topic.title for each in excluded_topics if each is not None - ] - print(f"topics to exclude: {topics_to_exclude}") - - # 3. Topics subscribed, and thus to include - # ========================================= - topic_subscriptions = TopicSubscription.all_for_user(user) - topics_to_include = [ - subscription.topic.title - for subscription in topic_subscriptions - if subscription is not None - ] - print(f"topics to include: {topic_subscriptions}") - - # 4. New Topics to exclude / filter out + # 2. New Topics to exclude / filter out # ================================= excluded_new_topics = NewTopicFilter.all_for_user(user) new_topics_to_exclude = [ @@ -79,7 +59,7 @@ ] print(f"New Topics to exclude: {excluded_new_topics}") - # 5. New Topics subscribed, and thus to include + # 3. 
New Topics subscribed, and thus to include # ========================================= topic_new_subscriptions = NewTopicSubscription.all_for_user(user) new_topics_to_include = [ @@ -102,8 +82,6 @@ def _prepare_user_constraints(user): language, upper_bounds, lower_bounds, - _list_to_string(topics_to_include), - _list_to_string(topics_to_exclude), _new_topics_to_string(new_topics_to_include), _new_topics_to_string(new_topics_to_exclude), _list_to_string(wanted_user_searches), @@ -143,8 +121,6 @@ def article_recommendations_for_user( language, upper_bounds, lower_bounds, - topics_to_include, - topics_to_exclude, new_topics_to_include, new_topics_to_exclude, wanted_user_searches, @@ -157,8 +133,6 @@ def article_recommendations_for_user( # build the query using elastic_query_builder query_body = build_elastic_recommender_query( count, - topics_to_include, - topics_to_exclude, wanted_user_searches, unwanted_user_searches, language, diff --git a/zeeguu/core/content_retriever/article_downloader.py b/zeeguu/core/content_retriever/article_downloader.py index fba7a341..58b70985 100644 --- a/zeeguu/core/content_retriever/article_downloader.py +++ b/zeeguu/core/content_retriever/article_downloader.py @@ -25,7 +25,7 @@ from zeeguu.core.content_quality.quality_filter import sufficient_quality from zeeguu.core.content_cleaning import cleanup_text_w_crawl_report from zeeguu.core.emailer.zeeguu_mailer import ZeeguuMailer -from zeeguu.core.model import Url, Feed, LocalizedTopic, UrlKeyword, NewTopic +from zeeguu.core.model import Url, Feed, UrlKeyword, NewTopic from zeeguu.core.model.new_article_topic_map import TopicOriginType import requests @@ -331,8 +331,6 @@ def download_feed_item(session, feed, feed_item, url, crawl_report): except Exception as e: print(f"Failed to parse image: '{e}'") - old_topics = add_topics(new_article, session) - logp(f"Old Topics ({old_topics})") url_keywords = add_url_keywords(new_article, session) logp(f"Topic Keywords: ({url_keywords})") if SEMANTIC_SEARCH_AVAILABLE: @@ -342,18 +340,6 @@ def download_feed_item(session, feed, feed_item, url, crawl_report): return new_article -def add_topics(new_article, session): - topics = [] - for loc_topic in LocalizedTopic.query.all(): - if loc_topic.language == new_article.language and loc_topic.matches_article( - new_article - ): - topics.append(loc_topic.topic.title) - new_article.add_topic(loc_topic.topic) - session.add(new_article) - return topics - - def add_new_topics(new_article, feed, url_keywords, session): HARDCODED_FEEDS = { 102: 8, # The Onion EN diff --git a/zeeguu/core/elastic/elastic_query_builder.py b/zeeguu/core/elastic/elastic_query_builder.py index 225da517..9ffba174 100644 --- a/zeeguu/core/elastic/elastic_query_builder.py +++ b/zeeguu/core/elastic/elastic_query_builder.py @@ -43,8 +43,6 @@ def more_like_this_query(count, article_text, language, page=0): def build_elastic_recommender_query( count, - topics, - unwanted_topics, user_topics, unwanted_user_topics, language, diff --git a/zeeguu/core/elastic/indexing.py b/zeeguu/core/elastic/indexing.py index 3c6d8eb7..b407a3c8 100644 --- a/zeeguu/core/elastic/indexing.py +++ b/zeeguu/core/elastic/indexing.py @@ -1,4 +1,4 @@ -from zeeguu.core.model import Topic, UrlKeyword, NewTopic +from zeeguu.core.model import UrlKeyword, NewTopic from zeeguu.core.model.article import article_topic_map from zeeguu.core.model.article_url_keyword_map import ArticleUrlKeywordMap from zeeguu.core.model.new_article_topic_map import TopicOriginType, NewArticleTopicMap @@ -8,19 +8,6 @@ from 
zeeguu.core.semantic_vector_api import get_embedding_from_article -def find_topics(article_id, session): - article_topic = ( - session.query(Topic) - .join(article_topic_map) - .filter(article_topic_map.c.article_id == article_id) - ) - topics = "" - for t in article_topic: - topics = topics + str(t.title) + " " - - return topics.rstrip() - - def find_new_topics(article_id, session): article_topics = ( session.query(NewTopic) @@ -53,47 +40,27 @@ def find_filter_url_keywords(article_id, session): return topic_kewyords -def document_from_article(article, session, topics=None, is_v7=True): - old_topics = find_topics(article.id, session) - - if is_v7: - print("## Warning: Version for ES is 7, using old indexing system...") - doc = { - "title": article.title, - "author": article.authors, - "content": article.content, - "summary": article.summary, - "word_count": article.word_count, - "published_time": article.published_time, - "topics": old_topics, - "language": article.language.name, - "fk_difficulty": article.fk_difficulty, - "lr_difficulty": DifficultyLingoRank.value_for_article(article), - "url": article.url.as_string(), - "video": article.video, - } - else: - topics, topics_inferred = find_new_topics(article.id, session) - doc = { - "title": article.title, - "author": article.authors, - "content": article.content, - "summary": article.summary, - "word_count": article.word_count, - "published_time": article.published_time, - "old_topics": old_topics, - "topics": [t.title for t in topics], - # We need to avoid using these as a way to classify further documents - # (we should rely on the human labels to classify further articles) - # rather than infer on inferences. - "topics_inferred": [t.title for t in topics_inferred], - "language": article.language.name, - "fk_difficulty": article.fk_difficulty, - "lr_difficulty": DifficultyLingoRank.value_for_article(article), - "sem_vec": get_embedding_from_article(article), - "url": article.url.as_string(), - "video": article.video, - } +def document_from_article(article, session): + topics, topics_inferred = find_new_topics(article.id, session) + doc = { + "title": article.title, + "author": article.authors, + "content": article.content, + "summary": article.summary, + "word_count": article.word_count, + "published_time": article.published_time, + "topics": [t.title for t in topics], + # We need to avoid using these as a way to classify further documents + # (we should rely on the human labels to classify further articles) + # rather than infer on inferences. 
+ "topics_inferred": [t.title for t in topics_inferred], + "language": article.language.name, + "fk_difficulty": article.fk_difficulty, + "lr_difficulty": DifficultyLingoRank.value_for_article(article), + "sem_vec": get_embedding_from_article(article), + "url": article.url.as_string(), + "video": article.video, + } return doc @@ -110,10 +77,9 @@ def create_or_update(article, session): return res -def create_or_update_bulk_docs(article, session, topics=None): +def create_or_update_bulk_docs(article, session): es = Elasticsearch(ES_CONN_STRING) - es_version = int(es.info()["version"]["number"][0]) - doc_data = document_from_article(article, session, topics, is_v7=es_version == 7) + doc_data = document_from_article(article, session) doc = {} doc["_id"] = article.id doc["_index"] = ES_ZINDEX @@ -134,8 +100,7 @@ def index_in_elasticsearch(new_article, session): """ try: es = Elasticsearch(ES_CONN_STRING) - es_version = int(es.info()["version"]["number"][0]) - doc = document_from_article(new_article, session, is_v7=es_version == 7) + doc = document_from_article(new_article, session) res = es.index(index=ES_ZINDEX, id=new_article.id, document=doc) except Exception as e: diff --git a/zeeguu/core/model/__init__.py b/zeeguu/core/model/__init__.py index 1f617d4d..5966ebab 100644 --- a/zeeguu/core/model/__init__.py +++ b/zeeguu/core/model/__init__.py @@ -44,16 +44,10 @@ from .user_language import UserLanguage -from .topic import Topic from .user_article import UserArticle from .article_difficulty_feedback import ArticleDifficultyFeedback from .feed import Feed - -from .topic import Topic -from .topic_subscription import TopicSubscription -from .topic_filter import TopicFilter -from .localized_topic import LocalizedTopic from .url_keyword import UrlKeyword from .search import Search diff --git a/zeeguu/core/model/article.py b/zeeguu/core/model/article.py index 49408881..c2b4eb58 100644 --- a/zeeguu/core/model/article.py +++ b/zeeguu/core/model/article.py @@ -89,12 +89,6 @@ class Article(db.Model): uploader_id = Column(Integer, ForeignKey(User.id)) uploader = relationship(User) - from zeeguu.core.model.topic import Topic - - topics = relationship( - Topic, secondary="article_topic_map", backref=backref("articles") - ) - new_topics = relationship("NewArticleTopicMap", back_populates="article") url_keywords = relationship("ArticleUrlKeywordMap", back_populates="article") @@ -164,12 +158,6 @@ def vote_broken(self): # somebody could vote that this article is broken self.broken += 1 - def topics_as_string(self): - topics = "" - for topic in self.topics: - topics += topic.title + " " - return topics - def new_topics_as_string(self): topics = "" for topic in self.new_topics: @@ -243,7 +231,6 @@ def fk_to_cefr(fk_difficulty): title=self.title, summary=summary, language=self.language.code, - topics=self.topics_as_string(), new_topics=self.new_topics_as_string(), new_topics_list=self.new_topics_as_tuple(), video=self.video, @@ -298,9 +285,6 @@ def article_info_for_teacher(self): def is_owned_by(self, user): return self.uploader_id == user.id - def add_topic(self, topic): - self.topics.append(topic) - def add_new_topic(self, new_topic, session, origin_type: TopicOriginType): t = NewArticleTopicMap( diff --git a/zeeguu/core/test/test_article.py b/zeeguu/core/test/test_article.py index 65b68fab..0c2addf2 100644 --- a/zeeguu/core/test/test_article.py +++ b/zeeguu/core/test/test_article.py @@ -5,7 +5,7 @@ import zeeguu.core from zeeguu.core.test.rules.article_rule import ArticleRule from 
zeeguu.core.test.rules.language_rule import LanguageRule -from zeeguu.core.model import Topic, Article +from zeeguu.core.model import Article from zeeguu.core.test.mocking_the_web import ( URL_CNN_KATHMANDU, URL_SPIEGEL_VENEZUELA, @@ -27,13 +27,6 @@ def test_articles_are_different(self): def test_article_representation_does_not_error(self): assert self.article1.article_info() - def test_add_topic(self): - health = Topic("health") - sports = Topic("sports") - self.article1.add_topic(health) - self.article1.add_topic(sports) - assert len(self.article1.topics) == 2 - def test_find_or_create(self): self.new_art = Article.find_or_create(session, URL_SPIEGEL_VENEZUELA) assert self.new_art.fk_difficulty diff --git a/zeeguu/core/test/test_retrieve_and_compute.py b/zeeguu/core/test/test_retrieve_and_compute.py index e3e4c8cb..c08afc88 100644 --- a/zeeguu/core/test/test_retrieve_and_compute.py +++ b/zeeguu/core/test/test_retrieve_and_compute.py @@ -11,7 +11,6 @@ sufficient_quality, LowQualityTypes, ) -from zeeguu.core.model import Topic, LocalizedTopic from tools.crawl_summary.crawl_report import CrawlReport from zeeguu.core.test.mocking_the_web import * @@ -35,22 +34,6 @@ def testDifficultyOfFeedItems(self): assert len(articles) == 2 assert articles[0].fk_difficulty - def testDownloadWithTopic(self): - feed = FeedRule().feed1 - topic = Topic("Spiegel") - zeeguu.core.model.db.session.add(topic) - zeeguu.core.model.db.session.commit() - loc_topic = LocalizedTopic(topic, self.lan, "spiegelDE", "spiegel") - zeeguu.core.model.db.session.add(loc_topic) - zeeguu.core.model.db.session.commit() - crawl_report = CrawlReport() - crawl_report.add_feed(feed) - download_from_feed(feed, zeeguu.core.model.db.session, crawl_report, 3, False) - - article = feed.get_articles(limit=2)[0] - - assert topic in article.topics - def test_sufficient_quality(self): art = newspaper.Article(URL_PROPUBLICA_INVESTING) art.download() diff --git a/zeeguu/core/test/test_user_article.py b/zeeguu/core/test/test_user_article.py index 14228db3..066b93c6 100644 --- a/zeeguu/core/test/test_user_article.py +++ b/zeeguu/core/test/test_user_article.py @@ -7,7 +7,6 @@ from zeeguu.core.test.rules.language_rule import LanguageRule from zeeguu.core.test.rules.user_article_rule import UserArticleRule from zeeguu.core.test.rules.user_rule import UserRule -from zeeguu.core.model import Topic from zeeguu.core.model.user_article import UserArticle db_session = zeeguu.core.model.db.session From e704c4b8a4a6677d9d6a3fde5a5cff00b6ef230d Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Tue, 12 Nov 2024 13:43:21 +0100 Subject: [PATCH 47/71] Deleting + Renaming Files - Deleted all files pertaining to the "old" topics of Zeeguu - Renamed all "New"-prefixed files by dropping the "new" prefix.
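A quick orientation sketch of the renames, for reading the diffs below. The mapping is illustrative only (these dicts exist nowhere in the codebase); the class and module names are taken from the diffs themselves.

# Illustrative summary of this patch, not code from the repository.
RENAMED_CLASSES = {
    "NewTopic": "Topic",                          # new_topic.py -> topic.py
    "NewTopicFilter": "TopicFilter",              # new_topic_filter.py -> topic_filter.py
    "NewTopicSubscription": "TopicSubscription",  # new_topic_subscription.py -> topic_subscription.py
    "NewArticleTopicMap": "ArticleTopicMap",      # new_article_topic_map.py -> article_topic_map.py
}
DELETED_CLASSES = ["LocalizedTopic"]  # the old keyword-matching topic localization goes away

Note that only the Python names change here: the renamed models still map to the "new_"-prefixed MySQL tables (e.g. Topic keeps __tablename__ = "new_topic") until the 24-11-12-rename_new_topics.sql migration two patches later.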
--- .../migrate_old_topics_to_new_topics.py | 6 +- .../set_new_topics_from_url_keyword.py | 6 +- .../set_topic_mapping_to_keywords.py | 4 +- tools/evaluate_infer_topics.py | 6 +- tools/extract_articles_with_new_topics.py | 6 +- .../migrations/24-11-12-delete_old_topics.sql | 12 ++ tools/mysql_to_elastic_new_topics.py | 24 +--- zeeguu/api/endpoints/article.py | 4 +- zeeguu/api/endpoints/topics.py | 29 +++-- .../elastic_recommender.py | 8 +- .../content_retriever/article_downloader.py | 6 +- zeeguu/core/elastic/indexing.py | 21 ++- zeeguu/core/model/__init__.py | 8 +- zeeguu/core/model/article.py | 17 +-- ...icle_topic_map.py => article_topic_map.py} | 4 +- .../core/model/article_topic_user_feedback.py | 6 +- zeeguu/core/model/localized_topic.py | 70 ---------- zeeguu/core/model/new_topic.py | 123 ------------------ zeeguu/core/model/new_topic_filter.py | 75 ----------- zeeguu/core/model/new_topic_subscription.py | 74 ----------- zeeguu/core/model/topic.py | 50 +++++-- zeeguu/core/model/topic_filter.py | 20 +-- zeeguu/core/model/topic_subscription.py | 21 +-- zeeguu/core/model/url_keyword.py | 10 +- 24 files changed, 149 insertions(+), 461 deletions(-) create mode 100644 tools/migrations/24-11-12-delete_old_topics.sql rename zeeguu/core/model/{new_article_topic_map.py => article_topic_map.py} (84%) delete mode 100644 zeeguu/core/model/localized_topic.py delete mode 100644 zeeguu/core/model/new_topic.py delete mode 100644 zeeguu/core/model/new_topic_filter.py delete mode 100644 zeeguu/core/model/new_topic_subscription.py diff --git a/tools/es_v8_migration/migrate_old_topics_to_new_topics.py b/tools/es_v8_migration/migrate_old_topics_to_new_topics.py index 48f1ba3e..8308c507 100644 --- a/tools/es_v8_migration/migrate_old_topics_to_new_topics.py +++ b/tools/es_v8_migration/migrate_old_topics_to_new_topics.py @@ -7,7 +7,7 @@ import zeeguu.core from zeeguu.api.app import create_app -from zeeguu.core.model import TopicSubscription, NewTopicSubscription, NewTopic +from zeeguu.core.model import TopicSubscription, TopicSubscription, Topic from tqdm import tqdm app = create_app() @@ -46,8 +46,8 @@ old_topic = topic_sub.topic new_topic_id = OLD_TOPIC_TO_NEW_TOPIC_MAP.get(old_topic.id, None) if new_topic_id: - new_topic = NewTopic.find_by_id(new_topic_id) - new_user_sub = NewTopicSubscription.find_or_create(db_session, user, new_topic) + new_topic = Topic.find_by_id(new_topic_id) + new_user_sub = TopicSubscription.find_or_create(db_session, user, new_topic) if VERBOSE: print( f"User {user.id}, was subscribed to '{old_topic.title}' and now is subscribed to: '{new_topic.title}'" diff --git a/tools/es_v8_migration/set_new_topics_from_url_keyword.py b/tools/es_v8_migration/set_new_topics_from_url_keyword.py index f2a385d5..73ae1af1 100644 --- a/tools/es_v8_migration/set_new_topics_from_url_keyword.py +++ b/tools/es_v8_migration/set_new_topics_from_url_keyword.py @@ -11,7 +11,7 @@ Article, ArticleUrlKeywordMap, UrlKeyword, - NewArticleTopicMap, + ArticleTopicMap, ) from tqdm import tqdm @@ -29,9 +29,9 @@ db_session.query(Article.id) .join(ArticleUrlKeywordMap) .join(UrlKeyword) - .join(NewArticleTopicMap, isouter=True) + .join(ArticleTopicMap, isouter=True) .filter(UrlKeyword.new_topic != None) - .filter(NewArticleTopicMap.new_topic_id == None) + .filter(ArticleTopicMap.new_topic_id == None) .all() ) print("Adding topics based on url keywords to articles...") diff --git a/tools/es_v8_migration/set_topic_mapping_to_keywords.py b/tools/es_v8_migration/set_topic_mapping_to_keywords.py index e6a2a16e..7fbddade 
100644 --- a/tools/es_v8_migration/set_topic_mapping_to_keywords.py +++ b/tools/es_v8_migration/set_topic_mapping_to_keywords.py @@ -1,6 +1,6 @@ import pandas as pd from zeeguu.core.model.url_keyword import UrlKeyword -from zeeguu.core.model.new_topic import NewTopic +from zeeguu.core.model.topic import Topic import zeeguu.core from tqdm import tqdm from zeeguu.api.app import create_app @@ -23,7 +23,7 @@ url_k_list = UrlKeyword.find_all_by_keyword(keyword) for url_k in url_k_list: topic_to_assign = ( - NewTopic.find_by_id(row["val_pred"]) if row["val_pred"] != -1 else None + Topic.find_by_id(row["val_pred"]) if row["val_pred"] != -1 else None ) url_k.new_topic = topic_to_assign db_session.add(url_k) diff --git a/tools/evaluate_infer_topics.py b/tools/evaluate_infer_topics.py index 86c3dba8..f67f7ca6 100644 --- a/tools/evaluate_infer_topics.py +++ b/tools/evaluate_infer_topics.py @@ -2,7 +2,7 @@ add_topics_based_on_semantic_hood_search, ) -from zeeguu.core.model import Article, Language, NewArticleTopicMap +from zeeguu.core.model import Article, Language, ArticleTopicMap from sklearn.metrics import classification_report from zeeguu.core.elastic.settings import ES_CONN_STRING, ES_ZINDEX @@ -34,9 +34,9 @@ ALL_IDS = [ a.article_id - for a in NewArticleTopicMap.query.join(Article) + for a in ArticleTopicMap.query.join(Article) .filter(Article.language != Language.find_by_id(19)) - .filter(NewArticleTopicMap.origin_type != 3) + .filter(ArticleTopicMap.origin_type != 3) .all() ] diff --git a/tools/extract_articles_with_new_topics.py b/tools/extract_articles_with_new_topics.py index efb5c8f0..21a80399 100644 --- a/tools/extract_articles_with_new_topics.py +++ b/tools/extract_articles_with_new_topics.py @@ -1,6 +1,6 @@ from zeeguu.core.model import Article from zeeguu.api.app import create_app -from zeeguu.core.model.new_article_topic_map import NewArticleTopicMap, TopicOriginType +from zeeguu.core.model.article_topic_map import ArticleTopicMap, TopicOriginType import numpy as np from tqdm import tqdm @@ -17,8 +17,8 @@ app.app_context().push() articles = ( - Article.query.join(NewArticleTopicMap) - .filter(NewArticleTopicMap.origin_type != TopicOriginType.INFERRED) + Article.query.join(ArticleTopicMap) + .filter(ArticleTopicMap.origin_type != TopicOriginType.INFERRED) .all() ) diff --git a/tools/migrations/24-11-12-delete_old_topics.sql b/tools/migrations/24-11-12-delete_old_topics.sql new file mode 100644 index 00000000..4d466cd7 --- /dev/null +++ b/tools/migrations/24-11-12-delete_old_topics.sql @@ -0,0 +1,12 @@ +/* + Removing the Old Topics, and renaming the new tables. 
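+ This script only drops the obsolete old-topic tables; the surviving
+ "new_*" tables are renamed separately in 24-11-12-rename_new_topics.sql.
+ The dependent tables (topic_filter, topic_subscription, article_topic_map,
+ localized_topic) are dropped before `topic`, so its foreign-key
+ references are gone by the time the final DROP TABLE runs.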
+ */ +DROP TABLE `zeeguu_test`.`topic_filter`; + +DROP TABLE `zeeguu_test`.`topic_subscription`; + +DROP TABLE `zeeguu_test`.`article_topic_map`; + +DROP TABLE `zeeguu_test`.`localized_topic`; + +DROP TABLE `zeeguu_test`.`topic`; \ No newline at end of file diff --git a/tools/mysql_to_elastic_new_topics.py b/tools/mysql_to_elastic_new_topics.py index 6ae0ccdd..e0e67dd4 100644 --- a/tools/mysql_to_elastic_new_topics.py +++ b/tools/mysql_to_elastic_new_topics.py @@ -13,10 +13,9 @@ from sqlalchemy.orm.exc import NoResultFound from zeeguu.api.app import create_app -from zeeguu.core.model import Topic, NewArticleTopicMap -from zeeguu.core.model.article import article_topic_map +from zeeguu.core.model import Topic, ArticleTopicMap from zeeguu.core.elastic.settings import ES_ZINDEX, ES_CONN_STRING -from zeeguu.core.model.new_article_topic_map import TopicOriginType +from zeeguu.core.model.article_topic_map import TopicOriginType import numpy as np from tqdm import tqdm @@ -46,19 +45,6 @@ print(es.info()) -def find_topics(article_id, session): - article_topic = ( - session.query(Topic) - .join(article_topic_map) - .filter(article_topic_map.c.article_id == article_id) - ) - topics = "" - for t in article_topic: - topics = topics + str(t.title) + " " - - return topics.rstrip() - - def main(): if DELETE_INDEX: try: @@ -97,9 +83,9 @@ def gen_docs(articles_w_topics): [ a_id[0] for a_id in db_session.query(Article.id) - .join(NewArticleTopicMap) + .join(ArticleTopicMap) .filter( - NewArticleTopicMap.origin_type != TopicOriginType.INFERRED + ArticleTopicMap.origin_type != TopicOriginType.INFERRED ) # Do not index Inferred topics .filter(Article.broken != 1) # Filter out documents that are broken # .filter(Article.language_id == 2) If only one language @@ -112,7 +98,7 @@ def gen_docs(articles_w_topics): [ art_id_w_topic[0] for art_id_w_topic in db_session.query( - NewArticleTopicMap.article_id + ArticleTopicMap.article_id ).distinct() ] ) diff --git a/zeeguu/api/endpoints/article.py b/zeeguu/api/endpoints/article.py index a5256dea..1e9f28af 100644 --- a/zeeguu/api/endpoints/article.py +++ b/zeeguu/api/endpoints/article.py @@ -1,6 +1,6 @@ import flask from flask import request -from zeeguu.core.model import Article, Language, User, NewTopic +from zeeguu.core.model import Article, Language, User, Topic from zeeguu.core.model.article_topic_user_feedback import ArticleTopicUserFeedback from zeeguu.api.utils import json_result from zeeguu.core.model.personal_copy import PersonalCopy @@ -126,7 +126,7 @@ def remove_ml_suggestion(): article_id = request.form.get("article_id", "") new_topic = request.form.get("new_topic", "") article = Article.find_by_id(article_id) - new_topic = NewTopic.find(new_topic) + new_topic = Topic.find(new_topic) try: ArticleTopicUserFeedback.find_or_create( db_session, diff --git a/zeeguu/api/endpoints/topics.py b/zeeguu/api/endpoints/topics.py index e33da2d4..f2bcdcc7 100644 --- a/zeeguu/api/endpoints/topics.py +++ b/zeeguu/api/endpoints/topics.py @@ -2,9 +2,9 @@ from zeeguu.logging import log from flask import request from zeeguu.core.model import ( - NewTopic, - NewTopicSubscription, - NewTopicFilter, + Topic, + TopicSubscription, + TopicFilter, Language, User, ) @@ -24,6 +24,7 @@ UNFILTER_NEW_TOPIC = "unfilter_new_topic" FILTERED_NEW_TOPICS = "filtered_new_topics" + # --------------------------------------------------------------------------- @api.route(f"/{SUBSCRIBE_NEW_TOPIC}", methods=("POST",)) # --------------------------------------------------------------------------- @@ 
-38,9 +39,9 @@ def subscribe_to_new_topic_with_id(): """ new_topic_id = int(request.form.get("new_topic_id", "")) - topic_object = NewTopic.find_by_id(new_topic_id) + topic_object = Topic.find_by_id(new_topic_id) user = User.find_by_id(flask.g.user_id) - NewTopicSubscription.find_or_create(db_session, user, topic_object) + TopicSubscription.find_or_create(db_session, user, topic_object) db_session.commit() return "OK" @@ -60,7 +61,7 @@ def unsubscribe_from_new_topic(): new_topic_id = int(request.form.get("new_topic_id", "")) user = User.find_by_id(flask.g.user_id) try: - to_delete = NewTopicSubscription.with_topic_id(new_topic_id, user) + to_delete = TopicSubscription.with_topic_id(new_topic_id, user) db_session.delete(to_delete) db_session.commit() except Exception as e: @@ -88,7 +89,7 @@ def get_subscribed_new_topics(): title = """ user = User.find_by_id(flask.g.user_id) - subscriptions = NewTopicSubscription.all_for_user(user) + subscriptions = TopicSubscription.all_for_user(user) topic_list = [] for sub in subscriptions: try: @@ -101,6 +102,7 @@ def get_subscribed_new_topics(): return json_result(topic_list) + # --------------------------------------------------------------------------- @api.route("/available_new_topics", methods=("GET",)) # --------------------------------------------------------------------------- @@ -119,10 +121,10 @@ def get_available_new_topics(): topic_data = [] user = User.find_by_id(flask.g.user_id) already_subscribed = [ - each.new_topic.id for each in NewTopicSubscription.all_for_user(user) + each.new_topic.id for each in TopicSubscription.all_for_user(user) ] user_learning_language = Language.find_by_id(user.learned_language_id) - topics = NewTopic.get_all_topics(user_learning_language) + topics = Topic.get_all_topics(user_learning_language) for topic in topics: if topic.id not in already_subscribed: @@ -146,9 +148,9 @@ def subscribe_to_new_filter_with_id(): filter_id = int(request.form.get("filter_id", "")) - filter_object = NewTopic.find_by_id(filter_id) + filter_object = Topic.find_by_id(filter_id) user = User.find_by_id(flask.g.user_id) - NewTopicFilter.find_or_create(db_session, user, filter_object) + TopicFilter.find_or_create(db_session, user, filter_object) return "OK" @@ -167,7 +169,7 @@ def unsubscribe_from_new_filter(): filter_id = int(request.form.get("new_topic_id", "")) try: - to_delete = NewTopicFilter.with_topic_id(filter_id, user) + to_delete = TopicFilter.with_topic_id(filter_id, user) db_session.delete(to_delete) db_session.commit() except Exception as e: @@ -195,7 +197,7 @@ def get_subscribed_new_filters(): title = """ user = User.find_by_id(flask.g.user_id) - filters = NewTopicFilter.all_for_user(user) + filters = TopicFilter.all_for_user(user) filter_list = [] for fil in filters: try: @@ -207,4 +209,3 @@ def get_subscribed_new_filters(): log(str(e)) return json_result(filter_list) - diff --git a/zeeguu/core/content_recommender/elastic_recommender.py b/zeeguu/core/content_recommender/elastic_recommender.py index 99561617..3812f7cb 100644 --- a/zeeguu/core/content_recommender/elastic_recommender.py +++ b/zeeguu/core/content_recommender/elastic_recommender.py @@ -14,8 +14,8 @@ from zeeguu.core.model import ( Article, - NewTopicFilter, - NewTopicSubscription, + TopicFilter, + TopicSubscription, SearchFilter, SearchSubscription, UserArticle, @@ -53,7 +53,7 @@ def _prepare_user_constraints(user): # 2. 
New Topics to exclude / filter out # ================================= - excluded_new_topics = NewTopicFilter.all_for_user(user) + excluded_new_topics = TopicFilter.all_for_user(user) new_topics_to_exclude = [ each.new_topic.title for each in excluded_new_topics if each is not None ] @@ -61,7 +61,7 @@ def _prepare_user_constraints(user): # 3. New Topics subscribed, and thus to include # ========================================= - topic_new_subscriptions = NewTopicSubscription.all_for_user(user) + topic_new_subscriptions = TopicSubscription.all_for_user(user) new_topics_to_include = [ subscription.new_topic.title for subscription in topic_new_subscriptions diff --git a/zeeguu/core/content_retriever/article_downloader.py b/zeeguu/core/content_retriever/article_downloader.py index 58b70985..1c8a703b 100644 --- a/zeeguu/core/content_retriever/article_downloader.py +++ b/zeeguu/core/content_retriever/article_downloader.py @@ -25,8 +25,8 @@ from zeeguu.core.content_quality.quality_filter import sufficient_quality from zeeguu.core.content_cleaning import cleanup_text_w_crawl_report from zeeguu.core.emailer.zeeguu_mailer import ZeeguuMailer -from zeeguu.core.model import Url, Feed, UrlKeyword, NewTopic -from zeeguu.core.model.new_article_topic_map import TopicOriginType +from zeeguu.core.model import Url, Feed, UrlKeyword, Topic +from zeeguu.core.model.article_topic_map import TopicOriginType import requests from zeeguu.core.model.article import MAX_CHAR_COUNT_IN_SUMMARY @@ -348,7 +348,7 @@ def add_new_topics(new_article, feed, url_keywords, session): # Handle Hard coded Feeds if feed.id in HARDCODED_FEEDS: print("Used HARDCODED feed") - topic = NewTopic.find_by_id(HARDCODED_FEEDS[feed.id]) + topic = Topic.find_by_id(HARDCODED_FEEDS[feed.id]) new_article.add_new_topic(topic, session, TopicOriginType.HARDSET.value) session.add(new_article) return TopicOriginType.HARDSET.value, [topic.title] diff --git a/zeeguu/core/elastic/indexing.py b/zeeguu/core/elastic/indexing.py index b407a3c8..5c6710ab 100644 --- a/zeeguu/core/elastic/indexing.py +++ b/zeeguu/core/elastic/indexing.py @@ -1,7 +1,6 @@ -from zeeguu.core.model import UrlKeyword, NewTopic -from zeeguu.core.model.article import article_topic_map +from zeeguu.core.model import UrlKeyword, Topic from zeeguu.core.model.article_url_keyword_map import ArticleUrlKeywordMap -from zeeguu.core.model.new_article_topic_map import TopicOriginType, NewArticleTopicMap +from zeeguu.core.model.article_topic_map import TopicOriginType, ArticleTopicMap from zeeguu.core.model.difficulty_lingo_rank import DifficultyLingoRank from elasticsearch import Elasticsearch from zeeguu.core.elastic.settings import ES_CONN_STRING, ES_ZINDEX @@ -10,17 +9,17 @@ def find_new_topics(article_id, session): article_topics = ( - session.query(NewTopic) - .join(NewArticleTopicMap) - .filter(NewArticleTopicMap.article_id == article_id) - .filter(NewArticleTopicMap.origin_type != TopicOriginType.INFERRED.value) + session.query(Topic) + .join(ArticleTopicMap) + .filter(ArticleTopicMap.article_id == article_id) + .filter(ArticleTopicMap.origin_type != TopicOriginType.INFERRED.value) .all() ) inferred_article_topics = ( - session.query(NewTopic) - .join(NewArticleTopicMap) - .filter(NewArticleTopicMap.article_id == article_id) - .filter(NewArticleTopicMap.origin_type == TopicOriginType.INFERRED.value) + session.query(Topic) + .join(ArticleTopicMap) + .filter(ArticleTopicMap.article_id == article_id) + .filter(ArticleTopicMap.origin_type == TopicOriginType.INFERRED.value) .all() ) return 
article_topics, inferred_article_topics diff --git a/zeeguu/core/model/__init__.py b/zeeguu/core/model/__init__.py index 5966ebab..8765111e 100644 --- a/zeeguu/core/model/__init__.py +++ b/zeeguu/core/model/__init__.py @@ -25,7 +25,7 @@ # the core model from .article_url_keyword_map import ArticleUrlKeywordMap -from .new_article_topic_map import NewArticleTopicMap +from .article_topic_map import ArticleTopicMap from .user_cohort_map import UserCohortMap from .language import Language from .url import Url @@ -70,9 +70,9 @@ from .cohort_article_map import CohortArticleMap # New topic features -from .new_topic import NewTopic -from .new_topic_subscription import NewTopicSubscription -from .new_topic_filter import NewTopicFilter +from .topic import Topic +from .topic_subscription import TopicSubscription +from .topic_filter import TopicFilter from .user_reading_session import UserReadingSession from .user_exercise_session import UserExerciseSession diff --git a/zeeguu/core/model/article.py b/zeeguu/core/model/article.py index c2b4eb58..bd0b7981 100644 --- a/zeeguu/core/model/article.py +++ b/zeeguu/core/model/article.py @@ -8,22 +8,15 @@ from sqlalchemy import Column, Integer, String, ForeignKey, DateTime, UnicodeText, Table from sqlalchemy.orm import relationship, backref from sqlalchemy.orm.exc import NoResultFound -from zeeguu.core.model.new_article_topic_map import TopicOriginType +from zeeguu.core.model.article_topic_map import TopicOriginType from zeeguu.core.language.difficulty_estimator_factory import DifficultyEstimatorFactory from zeeguu.core.model.article_url_keyword_map import ArticleUrlKeywordMap -from zeeguu.core.model.new_article_topic_map import NewArticleTopicMap +from zeeguu.core.model.article_topic_map import ArticleTopicMap from zeeguu.core.util.encoding import datetime_to_json from zeeguu.core.model import db -article_topic_map = Table( - "article_topic_map", - db.Model.metadata, - Column("article_id", Integer, ForeignKey("article.id")), - Column("topic_id", Integer, ForeignKey("topic.id")), -) - MAX_CHAR_COUNT_IN_SUMMARY = 300 MARKED_BROKEN_DUE_TO_LOW_QUALITY = 100 @@ -89,7 +82,7 @@ class Article(db.Model): uploader_id = Column(Integer, ForeignKey(User.id)) uploader = relationship(User) - new_topics = relationship("NewArticleTopicMap", back_populates="article") + new_topics = relationship("ArticleTopicMap", back_populates="article") url_keywords = relationship("ArticleUrlKeywordMap", back_populates="article") # Few words in an article is very often not an @@ -287,9 +280,7 @@ def is_owned_by(self, user): def add_new_topic(self, new_topic, session, origin_type: TopicOriginType): - t = NewArticleTopicMap( - article=self, new_topic=new_topic, origin_type=origin_type - ) + t = ArticleTopicMap(article=self, new_topic=new_topic, origin_type=origin_type) session.add(t) def set_new_topics(self, topics, session): diff --git a/zeeguu/core/model/new_article_topic_map.py b/zeeguu/core/model/article_topic_map.py similarity index 84% rename from zeeguu/core/model/new_article_topic_map.py rename to zeeguu/core/model/article_topic_map.py index 874bfacb..45d8e037 100644 --- a/zeeguu/core/model/new_article_topic_map.py +++ b/zeeguu/core/model/article_topic_map.py @@ -10,11 +10,11 @@ class TopicOriginType(IntEnum): INFERRED = 3 -class NewArticleTopicMap(db.Model): +class ArticleTopicMap(db.Model): __tablename__ = "new_article_topic_map" # Constants used for origin_type article_id = Column(ForeignKey("article.id"), primary_key=True) new_topic_id = Column(ForeignKey("new_topic.id"), 
primary_key=True) origin_type = Column(Integer) article = relationship("Article", back_populates="new_topics") - new_topic = relationship("NewTopic", back_populates="articles") + new_topic = relationship("Topic", back_populates="articles") diff --git a/zeeguu/core/model/article_topic_user_feedback.py b/zeeguu/core/model/article_topic_user_feedback.py index 46c7eb3f..c2affaaf 100644 --- a/zeeguu/core/model/article_topic_user_feedback.py +++ b/zeeguu/core/model/article_topic_user_feedback.py @@ -1,6 +1,6 @@ from sqlalchemy import UniqueConstraint from sqlalchemy.orm import relationship -from zeeguu.core.model.new_topic import NewTopic +from zeeguu.core.model.topic import Topic from zeeguu.core.model.user import User from zeeguu.core.model.article import Article import sqlalchemy @@ -30,8 +30,8 @@ class ArticleTopicUserFeedback(db.Model): user_id = db.Column(db.Integer, db.ForeignKey(User.id)) user = relationship(User) - new_topic_id = db.Column(db.Integer, db.ForeignKey(NewTopic.id)) - new_topic = relationship(NewTopic) + new_topic_id = db.Column(db.Integer, db.ForeignKey(Topic.id)) + new_topic = relationship(Topic) feedback = db.Column(db.String(50)) diff --git a/zeeguu/core/model/localized_topic.py b/zeeguu/core/model/localized_topic.py deleted file mode 100644 index 28f9fee2..00000000 --- a/zeeguu/core/model/localized_topic.py +++ /dev/null @@ -1,70 +0,0 @@ -from sqlalchemy.orm import relationship - -import zeeguu.core - -from sqlalchemy import Column, Integer, String, ForeignKey, and_ - -from zeeguu.core.model import db - - -class LocalizedTopic(db.Model): - """ - - A localized topic is a localized version of a topic, - it is the same topic but translated and with - the added language_id and localized keywords. - - """ - - __table_args__ = {"mysql_collate": "utf8_bin"} - - id = Column(Integer, primary_key=True) - - from zeeguu.core.model.topic import Topic - - topic_id = Column(Integer, ForeignKey(Topic.id)) - topic = relationship(Topic) - - from zeeguu.core.model.language import Language - - language_id = Column(Integer, ForeignKey(Language.id)) - language = relationship(Language) - - topic_translated = Column(String(30)) - - keywords = Column(String(1024)) - - def __init__( - self, - topic: Topic, - language: Language, - topic_translated: str, - keywords: str = "", - ): - self.topic = topic - self.language = language - self.topic_translated = topic_translated - self.keywords = keywords - - def __repr__(self): - return f"" - - def matches_article(self, article): - keywords = self.keywords.strip().split(" ") - - for keyword in keywords: - if keyword != "" and ( - keyword in article.url.as_string() or keyword in article.title - ): - return True - - return False - - def all_articles(self): - from zeeguu.core.model import Article - - return Article.query.filter(Article.topics.any(id=self.topic_id)).all() - - @classmethod - def all_for_language(cls, language): - return (cls.query.filter(cls.language == language)).all() diff --git a/zeeguu/core/model/new_topic.py b/zeeguu/core/model/new_topic.py deleted file mode 100644 index 2a4a9ac7..00000000 --- a/zeeguu/core/model/new_topic.py +++ /dev/null @@ -1,123 +0,0 @@ -from zeeguu.logging import logp - -from sqlalchemy import Column, Integer, String -from sqlalchemy.orm import relationship -from zeeguu.core.model import db -from zeeguu.core.model.language import Language -from zeeguu.core.model.new_article_topic_map import NewArticleTopicMap -from zeeguu.core.util.time import get_server_time_utc - - -class NewTopic(db.Model): - """ - The New Topics 
are standerdized accross all languages. - - Each UrlKeyword can be associated with one New Topic - which are used to infer topics in articles which haven't got any topic. - - This relationship is stored in NewArticleTopicMap. - """ - - __table_args__ = {"mysql_collate": "utf8_bin"} - __tablename__ = "new_topic" - - id = Column(Integer, primary_key=True) - - title = Column(String(64)) - articles = relationship("NewArticleTopicMap", back_populates="new_topic") - language_topic_available_cache = {} - - def __init__(self, title): - self.title = title - - def __repr__(self): - return f"" - - def as_dictionary(self): - - return dict( - id=self.id, - title=self.title, - ) - - def all_articles(self, limit=2000): - - from zeeguu.core.model import Article - - if hasattr(NewTopic, "cached_articles") and ( - self.cached_articles.get(self.id, None) - ): - logp(f"Topic: getting the cached articles for topic: {self.title}") - all_ids = NewTopic.cached_articles[self.id] - return Article.query.filter(Article.id.in_(all_ids)).all() - - if not hasattr(NewTopic, "cached_articles"): - NewTopic.cached_articles = {} - - logp("computing and caching the articles for topic: " + self.title) - NewTopic.cached_articles[self.id] = [ - each.id - for each in Article.query.order_by(Article.published_time.desc()) - .filter(Article.topics.any(id=self.id)) - .limit(limit) - ] - - all_ids = NewTopic.cached_articles[self.id] - return Article.query.filter(Article.id.in_(all_ids)).all() - - def clear_all_articles_cache(self): - NewTopic.cached_articles[self.id] = None - - @classmethod - def find(cls, name: str): - try: - return cls.query.filter(cls.title == name).one() - except Exception as e: - from sentry_sdk import capture_exception - - capture_exception(e) - return None - - @classmethod - def find_by_id(cls, i): - try: - result = cls.query.filter(cls.id == i).one() - return result - except Exception as e: - from sentry_sdk import capture_exception - - capture_exception(e) - return None - - @classmethod - def get_all_topics(cls, language: Language = None): - from zeeguu.core.model.article import Article - - def update_available_topic_cache(): - topics_for_language = ( - NewTopic.query.join(NewArticleTopicMap) - .join(Article) - .filter(Article.language_id == language.id) - .distinct(NewTopic.id) - .all() - ) - cls.language_topic_available_cache[language.id] = ( - topics_for_language, - get_server_time_utc(), - ) - - if language is None: - return NewTopic.query.order_by(NewTopic.title).all() - topics_available, last_check = cls.language_topic_available_cache.get( - language.id, (None, None) - ) - - if last_check is None: - update_available_topic_cache() - else: - time_since_last_check = get_server_time_utc() - last_check - if time_since_last_check.days > 7: - update_available_topic_cache() - - topics_available = cls.language_topic_available_cache[language.id][0] - return topics_available diff --git a/zeeguu/core/model/new_topic_filter.py b/zeeguu/core/model/new_topic_filter.py deleted file mode 100644 index 3b3c8164..00000000 --- a/zeeguu/core/model/new_topic_filter.py +++ /dev/null @@ -1,75 +0,0 @@ -from sqlalchemy import UniqueConstraint -from sqlalchemy.orm import relationship - -from zeeguu.core.model.new_topic import NewTopic -from zeeguu.core.model.user import User -import sqlalchemy - -import zeeguu.core - -from zeeguu.core.model import db - - -class NewTopicFilter(db.Model): - """ - - A topic filter is created when the user - wants to filter out a particular topic. 
- This is then taken into account in the - mixed recomemnder, when retrieving articles. - - """ - - __table_args__ = {"mysql_collate": "utf8_bin"} - __tablename__ = "new_topic_filter" - - id = db.Column(db.Integer, primary_key=True) - - user_id = db.Column(db.Integer, db.ForeignKey(User.id)) - user = relationship(User) - - new_topic_id = db.Column(db.Integer, db.ForeignKey(NewTopic.id)) - new_topic = relationship(NewTopic) - - UniqueConstraint(user_id, new_topic_id) - - def __init__(self, user, topic): - self.user = user - self.new_topic = topic - - def __str__(self): - return f"Topic filter ({self.user.name}, {self.new_topic})" - - __repr__ = __str__ - - @classmethod - def find_or_create(cls, session, user, topic): - try: - return ( - cls.query.filter(cls.user == user).filter(cls.new_topic == topic).one() - ) - except sqlalchemy.orm.exc.NoResultFound: - new = cls(user, topic) - session.add(new) - session.commit() - return new - - @classmethod - def all_for_user(cls, user): - return cls.query.filter(cls.user == user).all() - - @classmethod - def all_for_user_as_list(cls, user): - return [topic_id for topic_id in cls.all_for_user()] - - @classmethod - def with_id(cls, i): - return (cls.query.filter(cls.id == i)).one() - - @classmethod - def with_topic_id(cls, i, user): - return ( - (cls.query.filter(cls.new_topic_id == i)) - .filter(cls.user_id == user.id) - .one() - ) diff --git a/zeeguu/core/model/new_topic_subscription.py b/zeeguu/core/model/new_topic_subscription.py deleted file mode 100644 index 26d1062d..00000000 --- a/zeeguu/core/model/new_topic_subscription.py +++ /dev/null @@ -1,74 +0,0 @@ -from sqlalchemy import UniqueConstraint -from sqlalchemy.orm import relationship - -from zeeguu.core.model.new_topic import NewTopic -from zeeguu.core.model.user import User -import sqlalchemy - -import zeeguu.core - -from zeeguu.core.model import db - - -class NewTopicSubscription(db.Model): - """ - - A topic subscription is created when - the user subscribed to a particular topic. - This is then taken into account in the - mixed recomemmder, when retrieving articles. 
- - """ - - __table_args__ = {"mysql_collate": "utf8_bin"} - __tablename__ = "new_topic_subscription" - - id = db.Column(db.Integer, primary_key=True) - - user_id = db.Column(db.Integer, db.ForeignKey(User.id)) - user = relationship(User) - - new_topic_id = db.Column(db.Integer, db.ForeignKey(NewTopic.id)) - new_topic = relationship(NewTopic) - - UniqueConstraint(user_id, new_topic_id) - - def __init__(self, user, topic): - self.user = user - self.new_topic = topic - - def __str__(self): - return f"Topic subscription ({self.user.name}, {self.new_topic})" - - __repr__ = __str__ - - @classmethod - def find_or_create(cls, session, user, topic): - try: - return ( - cls.query.filter(cls.user == user).filter(cls.new_topic == topic).one() - ) - except sqlalchemy.orm.exc.NoResultFound: - new = cls(user, topic) - session.add(new) - return new - - @classmethod - def all_for_user(cls, user): - return cls.query.filter(cls.user == user).all() - - @classmethod - def all_for_user_as_list(cls, user): - return [topic_id for topic_id in cls.query.filter(cls.user == user).all()] - - @classmethod - def with_id(cls, i): - return (cls.query.filter(cls.id == i)).one() - - @classmethod - def with_topic_id(cls, i, user): - return ( - (cls.query.filter(cls.new_topic_id == i)) - .filter(cls.user_id == user.id) - .one() - ) diff --git a/zeeguu/core/model/topic.py b/zeeguu/core/model/topic.py index 7e8f8a4f..28c3df5e 100644 --- a/zeeguu/core/model/topic.py +++ b/zeeguu/core/model/topic.py @@ -1,26 +1,31 @@ from zeeguu.logging import logp from sqlalchemy import Column, Integer, String - -import zeeguu - +from sqlalchemy.orm import relationship from zeeguu.core.model import db +from zeeguu.core.model.language import Language +from zeeguu.core.model.article_topic_map import ArticleTopicMap +from zeeguu.core.util.time import get_server_time_utc class Topic(db.Model): """ + The New Topics are standerdized accross all languages. - A topic is the general (English) name of a topic, - the localized_topic contains the language, translation, - and the keywords used to find the articles. + Each UrlKeyword can be associated with one New Topic + which are used to infer topics in articles which haven't got any topic. + This relationship is stored in ArticleTopicMap. 
""" __table_args__ = {"mysql_collate": "utf8_bin"} + __tablename__ = "new_topic" id = Column(Integer, primary_key=True) title = Column(String(64)) + articles = relationship("ArticleTopicMap", back_populates="new_topic") + language_topic_available_cache = {} def __init__(self, title): self.title = title @@ -85,5 +90,34 @@ def find_by_id(cls, i): return None @classmethod - def get_all_topics(cls): - return Topic.query.order_by(Topic.title).all() + def get_all_topics(cls, language: Language = None): + from zeeguu.core.model.article import Article + + def update_available_topic_cache(): + topics_for_language = ( + Topic.query.join(ArticleTopicMap) + .join(Article) + .filter(Article.language_id == language.id) + .distinct(Topic.id) + .all() + ) + cls.language_topic_available_cache[language.id] = ( + topics_for_language, + get_server_time_utc(), + ) + + if language is None: + return Topic.query.order_by(Topic.title).all() + topics_available, last_check = cls.language_topic_available_cache.get( + language.id, (None, None) + ) + + if last_check is None: + update_available_topic_cache() + else: + time_since_last_check = get_server_time_utc() - last_check + if time_since_last_check.days > 7: + update_available_topic_cache() + + topics_available = cls.language_topic_available_cache[language.id][0] + return topics_available diff --git a/zeeguu/core/model/topic_filter.py b/zeeguu/core/model/topic_filter.py index c1d491bd..aea89670 100644 --- a/zeeguu/core/model/topic_filter.py +++ b/zeeguu/core/model/topic_filter.py @@ -21,31 +21,33 @@ class TopicFilter(db.Model): """ __table_args__ = {"mysql_collate": "utf8_bin"} - __tablename__ = "topic_filter" + __tablename__ = "new_topic_filter" id = db.Column(db.Integer, primary_key=True) user_id = db.Column(db.Integer, db.ForeignKey(User.id)) user = relationship(User) - topic_id = db.Column(db.Integer, db.ForeignKey(Topic.id)) - topic = relationship(Topic) + new_topic_id = db.Column(db.Integer, db.ForeignKey(Topic.id)) + new_topic = relationship(Topic) - UniqueConstraint(user_id, topic_id) + UniqueConstraint(user_id, new_topic_id) def __init__(self, user, topic): self.user = user - self.topic = topic + self.new_topic = topic def __str__(self): - return f"Topic filter ({self.user.name}, {self.topic})" + return f"Topic filter ({self.user.name}, {self.new_topic})" __repr__ = __str__ @classmethod def find_or_create(cls, session, user, topic): try: - return cls.query.filter(cls.user == user).filter(cls.topic == topic).one() + return ( + cls.query.filter(cls.user == user).filter(cls.new_topic == topic).one() + ) except sqlalchemy.orm.exc.NoResultFound: new = cls(user, topic) session.add(new) @@ -67,5 +69,7 @@ def with_id(cls, i): @classmethod def with_topic_id(cls, i, user): return ( - (cls.query.filter(cls.topic_id == i)).filter(cls.user_id == user.id).one() + (cls.query.filter(cls.new_topic_id == i)) + .filter(cls.user_id == user.id) + .one() ) diff --git a/zeeguu/core/model/topic_subscription.py b/zeeguu/core/model/topic_subscription.py index 9406484d..76115bd6 100644 --- a/zeeguu/core/model/topic_subscription.py +++ b/zeeguu/core/model/topic_subscription.py @@ -21,35 +21,36 @@ class TopicSubscription(db.Model): """ __table_args__ = {"mysql_collate": "utf8_bin"} - __tablename__ = "topic_subscription" + __tablename__ = "new_topic_subscription" id = db.Column(db.Integer, primary_key=True) user_id = db.Column(db.Integer, db.ForeignKey(User.id)) user = relationship(User) - topic_id = db.Column(db.Integer, db.ForeignKey(Topic.id)) - topic = relationship(Topic) + 
new_topic_id = db.Column(db.Integer, db.ForeignKey(Topic.id)) + new_topic = relationship(Topic) - UniqueConstraint(user_id, topic_id) + UniqueConstraint(user_id, new_topic_id) def __init__(self, user, topic): self.user = user - self.topic = topic + self.new_topic = topic def __str__(self): - return f"Topic subscription ({self.user.name}, {self.topic})" + return f"Topic subscription ({self.user.name}, {self.new_topic})" __repr__ = __str__ @classmethod def find_or_create(cls, session, user, topic): try: - return cls.query.filter(cls.user == user).filter(cls.topic == topic).one() + return ( + cls.query.filter(cls.user == user).filter(cls.new_topic == topic).one() + ) except sqlalchemy.orm.exc.NoResultFound: new = cls(user, topic) session.add(new) - session.commit() return new @classmethod @@ -67,5 +68,7 @@ def with_id(cls, i): @classmethod def with_topic_id(cls, i, user): return ( - (cls.query.filter(cls.topic_id == i)).filter(cls.user_id == user.id).one() + (cls.query.filter(cls.new_topic_id == i)) + .filter(cls.user_id == user.id) + .one() ) diff --git a/zeeguu/core/model/url_keyword.py b/zeeguu/core/model/url_keyword.py index 96d48097..82e68845 100644 --- a/zeeguu/core/model/url_keyword.py +++ b/zeeguu/core/model/url_keyword.py @@ -1,7 +1,7 @@ from sqlalchemy.orm import relationship from zeeguu.core.model.url import Url from zeeguu.core.model.language import Language -from zeeguu.core.model.new_topic import NewTopic +from zeeguu.core.model.topic import Topic from zeeguu.core.util import remove_duplicates_keeping_order import sqlalchemy import string @@ -59,13 +59,13 @@ class UrlKeyword(db.Model): language_id = db.Column(db.Integer, db.ForeignKey(Language.id)) language = relationship(Language) - new_topic_id = db.Column(db.Integer, db.ForeignKey(NewTopic.id)) - new_topic = relationship(NewTopic) + new_topic_id = db.Column(db.Integer, db.ForeignKey(Topic.id)) + new_topic = relationship(Topic) keyword = db.Column(db.String(45)) articles = relationship("ArticleUrlKeywordMap", back_populates="url_keyword") - def __init__(self, keyword: str, language: Language, new_topic: NewTopic = None): + def __init__(self, keyword: str, language: Language, new_topic: Topic = None): self.language = language self.new_topic = new_topic @@ -81,7 +81,7 @@ def get_keyword(self): @classmethod def find_or_create( - cls, session, keyword, language: Language, new_topic: NewTopic = None + cls, session, keyword, language: Language, new_topic: Topic = None ): try: return ( From 2a0d223f47234b5f7e0c2c717aab8cf027c2472d Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Tue, 12 Nov 2024 14:01:50 +0100 Subject: [PATCH 48/71] Remove old topics from topic generation --- tools/report_generator/data_extractor.py | 34 ---------------- tools/report_generator/generate_report.py | 49 +++-------------------- 2 files changed, 5 insertions(+), 78 deletions(-) diff --git a/tools/report_generator/data_extractor.py b/tools/report_generator/data_extractor.py index b004494b..5a34dda5 100644 --- a/tools/report_generator/data_extractor.py +++ b/tools/report_generator/data_extractor.py @@ -23,19 +23,6 @@ def run_query(self, query): df = pd.read_sql(query, con=self.db_connection) return df - def get_article_topics_df(self, feed_df): - print("Getting Article Topics...") - query = f"""SELECT a.id, l.name Language, a.feed_id, t.title Topic - FROM article a - INNER JOIN article_topic_map atm on a.id = atm.article_id - INNER JOIN topic t ON atm.topic_id = t.id - INNER JOIN language l ON l.id = a.language_id - WHERE DATEDIFF(CURDATE(), 
a.published_time) <= {self.DAYS_FOR_REPORT} - AND a.broken = 0""" - df = pd.read_sql(query, con=self.db_connection) - self.__add_feed_name(df, feed_df) - return df - def get_article_new_topics_df(self, feed_df): print("Getting Article New Topics...") query = f"""SELECT a.id, l.name Language, a.feed_id, t.title Topic, atm.origin_type @@ -273,27 +260,6 @@ def get_combined_user_reading_exercise_activity( ] return active_users_reading_or_exercises - def get_topic_reading_time(self): - print("Getting Topic Reading Times...") - query = f"""SELECT l.name as Language, t.title Topic, SUM(urs.duration) total_reading_time - FROM article a - LEFT JOIN article_topic_map atm on a.id = atm.article_id - LEFT JOIN topic t on atm.topic_id = t.id - INNER JOIN user_reading_session urs ON urs.article_id = a.id - INNER JOIN language l on a.language_id = l.id - INNER JOIN user u ON urs.user_id = u.id - WHERE DATEDIFF(CURDATE(), urs.start_time) <= {self.DAYS_FOR_REPORT} - AND u.learned_language_id = a.language_id - GROUP BY a.language_id, atm.topic_id;""" - topic_reading_time_df = pd.read_sql(query, con=self.db_connection) - topic_reading_time_df["total_reading_time"] = topic_reading_time_df[ - "total_reading_time" - ].apply(ms_to_mins) - topic_reading_time_df.loc[topic_reading_time_df["Topic"].isna(), "Topic"] = ( - "Unclassified" - ) - return topic_reading_time_df - def get_new_topic_reading_time(self): print("Getting New Topic Reading Times...") query = f"""SELECT l.name as Language, t.title Topic, SUM(urs.duration) total_reading_time diff --git a/tools/report_generator/generate_report.py b/tools/report_generator/generate_report.py index cd8ebf68..9957c0a8 100644 --- a/tools/report_generator/generate_report.py +++ b/tools/report_generator/generate_report.py @@ -79,8 +79,12 @@ def save_fig_params(filename): def get_new_repeating_sents_table(pd_repeating_sents): return generate_html_table(pd_repeating_sents.sort_values("Count", ascending=False)) + def get_new_url_keywords_table(pd_url_keywords_count): - return generate_html_table(pd_url_keywords_count.sort_values("count", ascending=False)) + return generate_html_table( + pd_url_keywords_count.sort_values("count", ascending=False) + ) + def get_rejected_sentences_table(total_deleted_sents): total_deleted_sents["Total"] = sum(total_deleted_sents.values()) @@ -341,41 +345,6 @@ def generate_unique_articles_read_plot(user_reading_time_df, lang=""): return save_fig_params(filename) -def generate_topic_reading_time(topic_reading_time_df, lang=""): - filename = ( - f"topic_reading_time_plot_all_lang_{date_str}_d{DAYS_FOR_REPORT}.png" - if lang == "" - else f"topic_reading_time_plot_{lang}_{date_str}_d{DAYS_FOR_REPORT}.png" - ) - plot_total_reading_time = ( - topic_reading_time_df.groupby(["Language", "Topic"]) - .total_reading_time.sum() - .reset_index() - ) - if lang == "": - ax = plt.subplot(111) - sns.barplot( - x="Language", - y="total_reading_time", - hue="Topic", - data=plot_total_reading_time, - palette=get_color_palette(len(plot_total_reading_time["Topic"].unique())), - ) - set_legend_to_right_side(ax) - plt.title("Total Reading Time by Topic per Language") - else: - sns.barplot( - x="Topic", - y="total_reading_time", - hue="Topic", - data=plot_total_reading_time[plot_total_reading_time["Language"] == lang], - ) - plt.title(f"{lang} - Total Reading time by Topic") - plt.xticks(rotation=35, ha="right") - plt.ylabel("Total Reading time (mins)") - return save_fig_params(filename) - - def generate_new_topic_reading_time(topic_reading_time_df, lang=""): filename = ( 
f"new_topic_reading_time_plot_all_lang_{date_str}_d{DAYS_FOR_REPORT}.png" @@ -546,7 +515,6 @@ def generate_html_page(): feed_df = data_extractor.get_feed_df() article_df = data_extractor.get_article_df(feed_df) - article_topics_df = data_extractor.get_article_topics_df(feed_df) new_article_topics_df = data_extractor.get_article_new_topics_df(feed_df) language_df = data_extractor.get_language_df() bookmark_df = data_extractor.get_bookmark_df() @@ -560,7 +528,6 @@ def generate_html_page(): user_exercise_time_df, user_reading_time_df ) ) - topic_reading_time_df = data_extractor.get_topic_reading_time() new_topic_reading_time_df = data_extractor.get_new_topic_reading_time() total_unique_articles_opened_by_users = len( article_df[article_df.id.isin(user_reading_time_df.id)] @@ -589,7 +556,6 @@ def generate_html_page(): else f"WARNING! This date only contains values from the last '{total_days_from_crawl_report}' day(s)." ) ACTIVE_USER_ACTIVITY_TIME_MIN = 1 - articles_with_topic_count = len(article_topics_df.id.unique()) articles_with_new_topic_count = len(new_article_topics_df.id.unique()) active_users = combined_user_activity_df[ ( @@ -607,7 +573,6 @@ def generate_html_page(): lang_report += f"""

{lang}

Articles Downloaded

-

User Activity

@@ -615,7 +580,6 @@ def generate_html_page(): if lang in active_users["Language"].values: lang_report += f"""

Total Active Users: {len(active_users[active_users["Language"] == lang])}

- @@ -655,9 +619,7 @@ def generate_html_page():

Total Articles Crawled: {len(article_df)}

Total Unique Articles Opened: {total_unique_articles_opened_by_users} -

Topic Coverage: {((articles_with_topic_count / len(article_df)) * 100) if len(article_df) > 0 else 0:.2f}%

New Topic Coverage: {((articles_with_new_topic_count / len(article_df)) * 100) if len(article_df) > 0 else 0:.2f}%

-

Possible Inactive feeds:

@@ -694,7 +656,6 @@ def generate_html_page(): {generate_top_opened_articles(user_reading_time_df, data_extractor, feed_df)} - """ From a7b61824c631658b791258fcdfd78fabe1510fe5 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Tue, 12 Nov 2024 14:38:50 +0100 Subject: [PATCH 49/71] Renaming new_[topics-related] - Dropping the new from the tables, as they are no longer relevant. --- .../set_new_topics_from_url_keyword.py | 4 +- tools/evaluate_infer_topics.py | 4 +- tools/extract_articles_with_new_topics.py | 4 +- .../migrations/24-11-12-rename_new_topics.sql | 78 +++++++++++++++++++ tools/report_generator/data_extractor.py | 16 ++-- tools/report_generator/generate_report.py | 30 +++---- tools/run_knn_topic_inferance.py | 3 +- tools/set_new_topics_article.py | 6 +- zeeguu/api/endpoints/article.py | 6 +- zeeguu/api/endpoints/topics.py | 58 +++++++------- .../elastic_recommender.py | 40 +++++----- .../content_retriever/article_downloader.py | 22 +++--- zeeguu/core/elastic/elastic_query_builder.py | 61 ++++----------- zeeguu/core/elastic/indexing.py | 4 +- zeeguu/core/model/article.py | 28 +++---- zeeguu/core/model/article_topic_map.py | 8 +- .../core/model/article_topic_user_feedback.py | 18 +++-- zeeguu/core/model/topic.py | 4 +- zeeguu/core/model/topic_filter.py | 20 ++--- zeeguu/core/model/topic_subscription.py | 20 ++--- zeeguu/core/model/url_keyword.py | 14 ++-- zeeguu/core/model/user_article.py | 18 ++--- 22 files changed, 251 insertions(+), 215 deletions(-) create mode 100644 tools/migrations/24-11-12-rename_new_topics.sql diff --git a/tools/es_v8_migration/set_new_topics_from_url_keyword.py b/tools/es_v8_migration/set_new_topics_from_url_keyword.py index 73ae1af1..647b882b 100644 --- a/tools/es_v8_migration/set_new_topics_from_url_keyword.py +++ b/tools/es_v8_migration/set_new_topics_from_url_keyword.py @@ -30,8 +30,8 @@ .join(ArticleUrlKeywordMap) .join(UrlKeyword) .join(ArticleTopicMap, isouter=True) - .filter(UrlKeyword.new_topic != None) - .filter(ArticleTopicMap.new_topic_id == None) + .filter(UrlKeyword.topic != None) + .filter(ArticleTopicMap.topic_id == None) .all() ) print("Adding topics based on url keywords to articles...") diff --git a/tools/evaluate_infer_topics.py b/tools/evaluate_infer_topics.py index f67f7ca6..4d655ee8 100644 --- a/tools/evaluate_infer_topics.py +++ b/tools/evaluate_infer_topics.py @@ -50,7 +50,7 @@ article_to_search, k_to_use ) - neighbouring_topics = [t.new_topic for a in a_found_t for t in a.new_topics] + neighbouring_topics = [t.topic for a in a_found_t for t in a.topic] neighbouring_keywords = [t.url_keyword for a in a_found_t for t in a.url_keywords] avg_score = sum([float(h["_score"]) for h in hits_t]) / len(hits_t) @@ -62,7 +62,7 @@ print("Keyword Counts") pprint(topics_key_counter) print() - og_topics = " ".join([str(t.new_topic.title) for t in article_to_search.new_topics]) + og_topics = " ".join([str(t.topic.title) for t in article_to_search.topics]) try: top_topic, count = topics_counter.most_common(1)[0] threshold = ( diff --git a/tools/extract_articles_with_new_topics.py b/tools/extract_articles_with_new_topics.py index 21a80399..b65cd1fd 100644 --- a/tools/extract_articles_with_new_topics.py +++ b/tools/extract_articles_with_new_topics.py @@ -27,8 +27,8 @@ tuple = ( a.id, a.content, - a.new_topics_as_string(), - a.new_topics[-1].new_topic.title, + a.topic(), + a.topics[-1].topic.title, ) articles_to_extract.append(tuple) diff --git a/tools/migrations/24-11-12-rename_new_topics.sql b/tools/migrations/24-11-12-rename_new_topics.sql new file 
mode 100644 index 00000000..c3138b54 --- /dev/null +++ b/tools/migrations/24-11-12-rename_new_topics.sql @@ -0,0 +1,78 @@ +/* + Renaming the new-topics tables to drop the "new" prefix. + */ +ALTER TABLE + `zeeguu_test`.`new_topic` RENAME TO `zeeguu_test`.`topic`; + +/* new_topic_filter -> topic_filter */ +ALTER TABLE + `zeeguu_test`.`new_topic_filter` DROP FOREIGN KEY `new_topic_filter_ibfk_2`; + +ALTER TABLE + `zeeguu_test`.`new_topic_filter` CHANGE COLUMN `new_topic_id` `topic_id` INT NULL DEFAULT NULL; + +ALTER TABLE + `zeeguu_test`.`new_topic_filter` +ADD + CONSTRAINT `topic_filter_ibfk_2` FOREIGN KEY (`topic_id`) REFERENCES `zeeguu_test`.`topic` (`id`); + +ALTER TABLE + `zeeguu_test`.`new_topic_filter` RENAME TO `zeeguu_test`.`topic_filter`; + +/* new_topic_subscription -> topic_subscription */ +ALTER TABLE + `zeeguu_test`.`new_topic_subscription` DROP FOREIGN KEY `new_topic_subscription_ibfk_2`; + +ALTER TABLE + `zeeguu_test`.`new_topic_subscription` CHANGE COLUMN `new_topic_id` `topic_id` INT NULL DEFAULT NULL; + +ALTER TABLE + `zeeguu_test`.`new_topic_subscription` +ADD + CONSTRAINT `topic_subscription_ibfk_2` FOREIGN KEY (`topic_id`) REFERENCES `zeeguu_test`.`topic` (`id`); + +ALTER TABLE + `zeeguu_test`.`new_topic_subscription` RENAME TO `zeeguu_test`.`topic_subscription`; + +/* new_topic_user_feedback -> topic_user_feedback */ +ALTER TABLE + `zeeguu_test`.`new_topic_user_feedback` DROP FOREIGN KEY `new_topic_user_feedback_ibfk_3`; + +ALTER TABLE + `zeeguu_test`.`new_topic_user_feedback` CHANGE COLUMN `new_topic_id` `topic_id` INT NULL DEFAULT NULL; + +ALTER TABLE + `zeeguu_test`.`new_topic_user_feedback` +ADD + CONSTRAINT `topic_user_feedback_ibfk_3` FOREIGN KEY (`topic_id`) REFERENCES `zeeguu_test`.`topic` (`id`); + +ALTER TABLE + `zeeguu_test`.`new_topic_user_feedback` RENAME TO `zeeguu_test`.`topic_user_feedback`; + +/* new_article_topic_map -> article_topic_map */ +ALTER TABLE + `zeeguu_test`.`new_article_topic_map` DROP FOREIGN KEY `new_article_topic_map_ibfk_2`; + +ALTER TABLE + `zeeguu_test`.`new_article_topic_map` CHANGE COLUMN `new_topic_id` `topic_id` INT NOT NULL; + +ALTER TABLE + `zeeguu_test`.`new_article_topic_map` +ADD + CONSTRAINT `article_topic_map_ibfk_2` FOREIGN KEY (`topic_id`) REFERENCES `zeeguu_test`.`topic` (`id`); + +ALTER TABLE + `zeeguu_test`.`new_article_topic_map` RENAME TO `zeeguu_test`.`article_topic_map`; + +/* url_keyword, update reference column name */ +ALTER TABLE + `zeeguu_test`.`url_keyword` DROP FOREIGN KEY `url_keyword_ibfk_2`; + +ALTER TABLE + `zeeguu_test`.`url_keyword` CHANGE COLUMN `new_topic_id` `topic_id` INT NULL DEFAULT NULL; + +ALTER TABLE + `zeeguu_test`.`url_keyword` +ADD + CONSTRAINT `url_keyword_ibfk_2` FOREIGN KEY (`topic_id`) REFERENCES `zeeguu_test`.`topic` (`id`); \ No newline at end of file diff --git a/tools/report_generator/data_extractor.py b/tools/report_generator/data_extractor.py index 5a34dda5..be11e7e4 100644 --- a/tools/report_generator/data_extractor.py +++ b/tools/report_generator/data_extractor.py @@ -23,12 +23,12 @@ def run_query(self, query): df = pd.read_sql(query, con=self.db_connection) return df - def get_article_new_topics_df(self, feed_df): + def get_article_topics_df(self, feed_df): print("Getting Article New Topics...") query = f"""SELECT a.id, l.name Language, a.feed_id, t.title Topic, atm.origin_type FROM article a - INNER JOIN new_article_topic_map atm on a.id = atm.article_id + INNER JOIN article_topic_map atm on 
a.id = atm.article_id + INNER JOIN topic t ON atm.topic_id = t.id INNER JOIN language l ON l.id = a.language_id WHERE DATEDIFF(CURDATE(), a.published_time) <= {self.DAYS_FOR_REPORT} AND a.broken = 0""" @@ -81,7 +81,7 @@ def get_url_keyword_counts(self, min_count=100): ON uk.id = keyword_count.url_keyword_id JOIN language l ON l.id = language_id WHERE count > {min_count} - AND new_topic_id is NULL + AND topic_id is NULL AND keyword not in ( "news", "i", @@ -260,18 +260,18 @@ def get_combined_user_reading_exercise_activity( ] return active_users_reading_or_exercises - def get_new_topic_reading_time(self): + def get_topic_reading_time(self): print("Getting New Topic Reading Times...") query = f"""SELECT l.name as Language, t.title Topic, SUM(urs.duration) total_reading_time FROM article a - LEFT JOIN new_article_topic_map atm on a.id = atm.article_id - LEFT JOIN new_topic t on atm.new_topic_id = t.id + LEFT JOIN article_topic_map atm on a.id = atm.article_id + LEFT JOIN topic t on atm.topic_id = t.id INNER JOIN user_reading_session urs ON urs.article_id = a.id INNER JOIN language l on a.language_id = l.id INNER JOIN user u ON urs.user_id = u.id WHERE DATEDIFF(CURDATE(), urs.start_time) <= {self.DAYS_FOR_REPORT} AND u.learned_language_id = a.language_id - GROUP BY a.language_id, atm.new_topic_id;""" + GROUP BY a.language_id, atm.topic_id;""" topic_reading_time_df = pd.read_sql(query, con=self.db_connection) topic_reading_time_df["total_reading_time"] = topic_reading_time_df[ "total_reading_time" diff --git a/tools/report_generator/generate_report.py b/tools/report_generator/generate_report.py index 9957c0a8..bf939fe9 100644 --- a/tools/report_generator/generate_report.py +++ b/tools/report_generator/generate_report.py @@ -80,7 +80,7 @@ def get_new_repeating_sents_table(pd_repeating_sents): return generate_html_table(pd_repeating_sents.sort_values("Count", ascending=False)) -def get_new_url_keywords_table(pd_url_keywords_count): +def get_url_keywords_table(pd_url_keywords_count): return generate_html_table( pd_url_keywords_count.sort_values("count", ascending=False) ) @@ -164,10 +164,10 @@ def generate_topic_by_feed_plot(article_topic_df, lang): return save_fig_params(filename) -def generate_new_topic_by_feed_plot(article_topic_df, lang): +def generate_topic_by_feed_plot(article_topic_df, lang): # If I want to make topics consistant # https://stackoverflow.com/questions/39000115/how-can-i-set-the-colors-per-value-when-coloring-plots-by-a-dataframe-column - filename = f"new_topics_per_feed_lang_{lang}_{date_str}_d{DAYS_FOR_REPORT}.png" + filename = f"topics_per_feed_lang_{lang}_{date_str}_d{DAYS_FOR_REPORT}.png" topic_monitor = ( article_topic_df.groupby(["Language", "Feed Name"]) .Topic.value_counts() @@ -216,8 +216,8 @@ def generate_topic_coverage_plot(article_df, article_with_topics_df): return save_fig_params(filename) -def generate_new_topic_coverage_plot(article_df, article_with_topics_df): - filename = f"new_topic_coverage_plot_{date_str}_d{DAYS_FOR_REPORT}.png" +def generate_topic_coverage_plot(article_df, article_with_topics_df): + filename = f"topic_coverage_plot_{date_str}_d{DAYS_FOR_REPORT}.png" article_df["has_topic"] = "No" article_df.loc[article_df.id.isin(article_with_topics_df.id), "has_topic"] = "Yes" articles_with_topics = ( @@ -345,11 +345,11 @@ def generate_unique_articles_read_plot(user_reading_time_df, lang=""): return save_fig_params(filename) -def generate_new_topic_reading_time(topic_reading_time_df, lang=""): +def generate_topic_reading_time(topic_reading_time_df, 
lang=""): filename = ( - f"new_topic_reading_time_plot_all_lang_{date_str}_d{DAYS_FOR_REPORT}.png" + f"topic_reading_time_plot_all_lang_{date_str}_d{DAYS_FOR_REPORT}.png" if lang == "" - else f"new_topic_reading_time_plot_{lang}_{date_str}_d{DAYS_FOR_REPORT}.png" + else f"topic_reading_time_plot_{lang}_{date_str}_d{DAYS_FOR_REPORT}.png" ) plot_total_reading_time = ( topic_reading_time_df.groupby(["Language", "Topic"]) @@ -515,7 +515,7 @@ def generate_html_page(): feed_df = data_extractor.get_feed_df() article_df = data_extractor.get_article_df(feed_df) - new_article_topics_df = data_extractor.get_article_new_topics_df(feed_df) + new_article_topics_df = data_extractor.get_article_topics_df(feed_df) language_df = data_extractor.get_language_df() bookmark_df = data_extractor.get_bookmark_df() data_extractor.add_stats_to_feed(feed_df, article_df) @@ -528,7 +528,7 @@ def generate_html_page(): user_exercise_time_df, user_reading_time_df ) ) - new_topic_reading_time_df = data_extractor.get_new_topic_reading_time() + new_topic_reading_time_df = data_extractor.get_topic_reading_time() total_unique_articles_opened_by_users = len( article_df[article_df.id.isin(user_reading_time_df.id)] ) @@ -573,14 +573,14 @@ def generate_html_page(): lang_report += f"""

{lang}

Articles Downloaded

- +

User Activity

""" if lang in active_users["Language"].values: lang_report += f"""

Total Active users: {len(active_users[active_users["Language"] == lang])}

- + @@ -620,7 +620,7 @@ def generate_html_page():

Total Articles Crawled: {len(article_df)}

Total Unique Articles Opened: {total_unique_articles_opened_by_users}

New Topic Coverage: {((articles_with_new_topic_count / len(article_df)) * 100) if len(article_df) > 0 else 0:.2f}%

- +

Possible Inactive feeds:

Full table

@@ -656,7 +656,7 @@ def generate_html_page(): {generate_top_opened_articles(user_reading_time_df, data_extractor, feed_df)} - + """ result += f""" @@ -679,7 +679,7 @@ def generate_html_page(): result += f"""

New URL keywords without topics:

URL keywords that occur more than 100 times in articles and are not mapped to a topic. They are unique per language.

- {get_new_url_keywords_table(pd_new_url_keywords) if DAYS_FOR_REPORT <= 7 else "

Skipped due to long period.

"} + {get_url_keywords_table(pd_new_url_keywords) if DAYS_FOR_REPORT <= 7 else "

Skipped due to long period.

"}

Feed activity:

{generate_html_table(pd_feed_innactivity_time)} diff --git a/tools/run_knn_topic_inferance.py b/tools/run_knn_topic_inferance.py index bbe3bf20..2098e216 100644 --- a/tools/run_knn_topic_inferance.py +++ b/tools/run_knn_topic_inferance.py @@ -10,7 +10,6 @@ from zeeguu.core.elastic.settings import ES_CONN_STRING, ES_ZINDEX from elasticsearch import Elasticsearch from collections import Counter -from zeeguu.core.elastic.elastic_query_builder import build_elastic_recommender_query from zeeguu.api.app import create_app import argparse @@ -79,7 +78,7 @@ def search_similar_to_article(article_id): hit["_source"].get("url", ""), hit["_score"], ) - neighbouring_topics = [t.new_topic for a in a_found_t for t in a.new_topics] + neighbouring_topics = [t.topic for a in a_found_t for t in a.topics] TOPICS_TO_NOT_COUNT = set(["news", "aktuell", "nyheder", "nieuws", "article"]) neighbouring_keywords = [ t.url_keywords diff --git a/tools/set_new_topics_article.py b/tools/set_new_topics_article.py index c45e6170..8d35cbba 100644 --- a/tools/set_new_topics_article.py +++ b/tools/set_new_topics_article.py @@ -2,7 +2,7 @@ import zeeguu.core from zeeguu.api.app import create_app -from zeeguu.core.content_retriever.article_downloader import add_new_topics +from zeeguu.core.content_retriever.article_downloader import add_topics from zeeguu.core.model import Article from tqdm import tqdm @@ -33,10 +33,10 @@ if article is None: print("Skipping null article") continue - if len(article.new_topics) > 0: + if len(article.topics) > 0: print("This article already has topics!") continue - add_new_topics( + add_topics( article, article.feed, [auk.url_keyword for auk in article.url_keywords], diff --git a/zeeguu/api/endpoints/article.py b/zeeguu/api/endpoints/article.py index 1e9f28af..e746420d 100644 --- a/zeeguu/api/endpoints/article.py +++ b/zeeguu/api/endpoints/article.py @@ -124,15 +124,15 @@ def remove_ml_suggestion(): """ user = User.find_by_id(flask.g.user_id) article_id = request.form.get("article_id", "") - new_topic = request.form.get("new_topic", "") + topic = request.form.get("topic", "") article = Article.find_by_id(article_id) - new_topic = Topic.find(new_topic) + topic = Topic.find(topic) try: ArticleTopicUserFeedback.find_or_create( db_session, article, user, - new_topic, + topic, ArticleTopicUserFeedback.DO_NOT_SHOW_FEEDBACK, ) return "OK" diff --git a/zeeguu/api/endpoints/topics.py b/zeeguu/api/endpoints/topics.py index f2bcdcc7..3ace93e7 100644 --- a/zeeguu/api/endpoints/topics.py +++ b/zeeguu/api/endpoints/topics.py @@ -17,29 +17,29 @@ db_session = zeeguu.core.model.db.session -SUBSCRIBE_NEW_TOPIC = "subscribe_new_topic" -UNSUBSCRIBE_NEW_TOPIC = "unsubscribe_new_topic" -SUBSCRIBED_NEW_TOPICS = "subscribed_new_topics" -FILTER_NEW_TOPIC = "filter_new_topic" -UNFILTER_NEW_TOPIC = "unfilter_new_topic" -FILTERED_NEW_TOPICS = "filtered_new_topics" +SUBSCRIBE_TOPIC = "subscribe_topic" +UNSUBSCRIBE_TOPIC = "unsubscribe_topic" +SUBSCRIBED_TOPICS = "subscribed_topics" +FILTER_TOPIC = "filter_topic" +UNFILTER_TOPIC = "unfilter_topic" +FILTERED_TOPICS = "filtered_topics" # --------------------------------------------------------------------------- -@api.route(f"/{SUBSCRIBE_NEW_TOPIC}", methods=("POST",)) +@api.route(f"/{SUBSCRIBE_TOPIC}", methods=("POST",)) # --------------------------------------------------------------------------- @cross_domain @requires_session -def subscribe_to_new_topic_with_id(): +def subscribe_to_topic_with_id(): """ - :param: new_topic_id -- the id of the topic to be subscribed to. 
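+
+    A hypothetical client sketch (illustrative only; the base URL is an
+    assumption and session authentication is omitted):
+
+        import requests
+        requests.post(f"{API_BASE}/subscribe_topic", data={"topic_id": 3})
+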
+ :param: topic_id -- the id of the topic to be subscribed to. Subscribe to the topic with the given id :return: "OK" in case of success """ - new_topic_id = int(request.form.get("new_topic_id", "")) + topic_id = int(request.form.get("topic_id", "")) - topic_object = Topic.find_by_id(new_topic_id) + topic_object = Topic.find_by_id(topic_id) user = User.find_by_id(flask.g.user_id) TopicSubscription.find_or_create(db_session, user, topic_object) db_session.commit() @@ -47,21 +47,21 @@ def subscribe_to_new_topic_with_id(): # --------------------------------------------------------------------------- -@api.route(f"/{UNSUBSCRIBE_NEW_TOPIC}", methods=("POST",)) +@api.route(f"/{UNSUBSCRIBE_TOPIC}", methods=("POST",)) # --------------------------------------------------------------------------- @cross_domain @requires_session -def unsubscribe_from_new_topic(): +def unsubscribe_from_topic(): """ A user can unsubscribe from the topic with a given ID :return: "OK" in case of success """ - new_topic_id = int(request.form.get("new_topic_id", "")) + topic_id = int(request.form.get("topic_id", "")) user = User.find_by_id(flask.g.user_id) try: - to_delete = TopicSubscription.with_topic_id(new_topic_id, user) + to_delete = TopicSubscription.with_topic_id(topic_id, user) db_session.delete(to_delete) db_session.commit() except Exception as e: @@ -74,11 +74,11 @@ def unsubscribe_from_new_topic(): # --------------------------------------------------------------------------- -@api.route(f"/{SUBSCRIBED_NEW_TOPICS}", methods=("GET",)) +@api.route(f"/{SUBSCRIBED_TOPICS}", methods=("GET",)) # --------------------------------------------------------------------------- @cross_domain @requires_session -def get_subscribed_new_topics(): +def get_subscribed_topics(): """ A user might be subscribed to multiple topics at once. This endpoint returns them as a list. @@ -93,7 +93,7 @@ def get_subscribed_new_topics(): topic_list = [] for sub in subscriptions: try: - topic_list.append(sub.new_topic.as_dictionary()) + topic_list.append(sub.topic.as_dictionary()) except Exception as e: from sentry_sdk import capture_exception @@ -104,11 +104,11 @@ def get_subscribed_new_topics(): # --------------------------------------------------------------------------- -@api.route("/available_new_topics", methods=("GET",)) +@api.route("/available_topics", methods=("GET",)) # --------------------------------------------------------------------------- @cross_domain @requires_session -def get_available_new_topics(): +def get_available_topics(): """ Get a list of interesting topics for the given language. Interesting topics are for now defined as: @@ -121,7 +121,7 @@ def get_available_new_topics(): topic_data = [] user = User.find_by_id(flask.g.user_id) already_subscribed = [ - each.new_topic.id for each in TopicSubscription.all_for_user(user) + each.topic.id for each in TopicSubscription.all_for_user(user) ] user_learning_language = Language.find_by_id(user.learned_language_id) topics = Topic.get_all_topics(user_learning_language) @@ -134,11 +134,11 @@ def get_available_new_topics(): # --------------------------------------------------------------------------- -@api.route(f"/{FILTER_NEW_TOPIC}", methods=("POST",)) +@api.route(f"/{FILTER_TOPIC}", methods=("POST",)) # --------------------------------------------------------------------------- @cross_domain @requires_session -def subscribe_to_new_filter_with_id(): +def subscribe_to_filter_with_id(): """ :param: filter_id -- the id of the filter to be subscribed to. 
Subscribe to the filter with the given id @@ -156,17 +156,17 @@ def subscribe_to_new_filter_with_id(): # --------------------------------------------------------------------------- -@api.route(f"/{UNFILTER_NEW_TOPIC}", methods=("POST",)) +@api.route(f"/{UNFILTER_TOPIC}", methods=("POST",)) # --------------------------------------------------------------------------- @cross_domain @requires_session -def unsubscribe_from_new_filter(): +def unsubscribe_from_filter(): """ A user can unsubscribe from the filter with a given ID :return: OK / ERROR """ user = User.find_by_id(flask.g.user_id) - filter_id = int(request.form.get("new_topic_id", "")) + filter_id = int(request.form.get("topic_id", "")) try: to_delete = TopicFilter.with_topic_id(filter_id, user) @@ -182,11 +182,11 @@ def unsubscribe_from_new_filter(): # --------------------------------------------------------------------------- -@api.route(f"/{FILTERED_NEW_TOPICS}", methods=("GET",)) +@api.route(f"/{FILTERED_TOPICS}", methods=("GET",)) # --------------------------------------------------------------------------- @cross_domain @requires_session -def get_subscribed_new_filters(): +def get_subscribed_filters(): """ A user might be subscribed to multiple filters at once. This endpoint returns them as a list. @@ -201,7 +201,7 @@ def get_subscribed_new_filters(): filter_list = [] for fil in filters: try: - filter_list.append(fil.new_topic.as_dictionary()) + filter_list.append(fil.topic.as_dictionary()) except Exception as e: from sentry_sdk import capture_exception diff --git a/zeeguu/core/content_recommender/elastic_recommender.py b/zeeguu/core/content_recommender/elastic_recommender.py index 3812f7cb..08fb4588 100644 --- a/zeeguu/core/content_recommender/elastic_recommender.py +++ b/zeeguu/core/content_recommender/elastic_recommender.py @@ -10,7 +10,6 @@ from elasticsearch import Elasticsearch from elasticsearch_dsl import Search, Q, SF from pprint import pprint -from zeeguu.api.endpoints.feature_toggles import _new_topics from zeeguu.core.model import ( Article, @@ -53,21 +52,21 @@ def _prepare_user_constraints(user): # 2. New Topics to exclude / filter out # ================================= - excluded_new_topics = TopicFilter.all_for_user(user) - new_topics_to_exclude = [ - each.new_topic.title for each in excluded_new_topics if each is not None + excluded_topics = TopicFilter.all_for_user(user) + topics_to_exclude = [ + each.topic.title for each in excluded_topics if each is not None ] - print(f"New Topics to exclude: {excluded_new_topics}") + print(f"Topics to exclude: {excluded_topics}") # 3. New Topics subscribed, and thus to include # ========================================= - topic_new_subscriptions = TopicSubscription.all_for_user(user) - new_topics_to_include = [ - subscription.new_topic.title - for subscription in topic_new_subscriptions + topic_subscriptions = TopicSubscription.all_for_user(user) + topics_to_include = [ + subscription.topic.title + for subscription in topic_subscriptions if subscription is not None ] - print(f"New Topics to include: {topic_new_subscriptions}") + print(f"New Topics to include: {topic_subscriptions}") # 6. 
Wanted user topics # ========================================= @@ -82,8 +81,8 @@ def _prepare_user_constraints(user): language, upper_bounds, lower_bounds, - _new_topics_to_string(new_topics_to_include), - _new_topics_to_string(new_topics_to_exclude), + _topics_to_string(topics_to_include), + _topics_to_string(topics_to_exclude), _list_to_string(wanted_user_searches), _list_to_string(unwanted_user_searches), ) @@ -121,8 +120,8 @@ def article_recommendations_for_user( language, upper_bounds, lower_bounds, - new_topics_to_include, - new_topics_to_exclude, + topics_to_include, + topics_to_exclude, wanted_user_searches, unwanted_user_searches, ) = _prepare_user_constraints(user) @@ -141,10 +140,9 @@ def article_recommendations_for_user( es_scale, es_offset, es_decay, - new_topics_to_include=new_topics_to_include, - new_topics_to_exclude=new_topics_to_exclude, + topics_to_include=topics_to_include, + topics_to_exclude=topics_to_exclude, page=page, - user_using_new_topics=_new_topics(user), is_es_v7=es_version == 7, ) @@ -197,10 +195,8 @@ def article_search_for_user( lower_bounds, topics_to_include, topics_to_exclude, - new_topics_to_include, - new_topics_to_exclude, - wanted_user_topics, - unwanted_user_topics, + wanted_user_searches, + unwanted_user_searches, ) = _prepare_user_constraints(user) # build the query using elastic_query_builder @@ -292,7 +288,7 @@ def _list_to_string(input_list): return " ".join([each for each in input_list]) or "" -def _new_topics_to_string(input_list): +def _topics_to_string(input_list): return ",".join(input_list) diff --git a/zeeguu/core/content_retriever/article_downloader.py b/zeeguu/core/content_retriever/article_downloader.py index 1c8a703b..0d34db8c 100644 --- a/zeeguu/core/content_retriever/article_downloader.py +++ b/zeeguu/core/content_retriever/article_downloader.py @@ -334,13 +334,13 @@ def download_feed_item(session, feed, feed_item, url, crawl_report): url_keywords = add_url_keywords(new_article, session) logp(f"Topic Keywords: ({url_keywords})") if SEMANTIC_SEARCH_AVAILABLE: - _, topics = add_new_topics(new_article, feed, url_keywords, session) + _, topics = add_topics(new_article, feed, url_keywords, session) logp(f"New Topics ({topics})") session.add(new_article) return new_article -def add_new_topics(new_article, feed, url_keywords, session): +def add_topics(new_article, feed, url_keywords, session): HARDCODED_FEEDS = { 102: 8, # The Onion EN 121: 8, # Lercio IT @@ -349,7 +349,7 @@ def add_new_topics(new_article, feed, url_keywords, session): if feed.id in HARDCODED_FEEDS: print("Used HARDCODED feed") topic = Topic.find_by_id(HARDCODED_FEEDS[feed.id]) - new_article.add_new_topic(topic, session, TopicOriginType.HARDSET.value) + new_article.add_topic(topic, session, TopicOriginType.HARDSET.value) session.add(new_article) return TopicOriginType.HARDSET.value, [topic.title] @@ -357,7 +357,7 @@ def add_new_topics(new_article, feed, url_keywords, session): topics = [] topics_added = set() for topic_key in url_keywords: - topic = topic_key.new_topic + topic = topic_key.topic print(topic_key, topic) if ( topic is not None @@ -366,7 +366,7 @@ def add_new_topics(new_article, feed, url_keywords, session): continue topics_added.add(topic.id) topics.append(topic) - new_article.add_new_topic(topic, session, TopicOriginType.URL_PARSED.value) + new_article.add_topic(topic, session, TopicOriginType.URL_PARSED.value) if len(topics) > 0: print("Used URL PARSED") @@ -374,14 +374,14 @@ def add_new_topics(new_article, feed, url_keywords, session): # If we have only 
one topic and that is News, we will try to infer. if not (len(topics) == 1 and 9 in topics_added): return TopicOriginType.URL_PARSED.value, [ - t.new_topic.title for t in new_article.new_topics + t.topic.title for t in new_article.topics ] from collections import Counter # Add based on KK neighbours: found_articles, _ = add_topics_based_on_semantic_hood_search(new_article) - neighbouring_topics = [t.new_topic for a in found_articles for t in a.new_topics] + neighbouring_topics = [t.topic for a in found_articles for t in a.topics] if len(neighbouring_topics) > 0: from pprint import pprint @@ -393,12 +393,10 @@ def add_new_topics(new_article, feed, url_keywords, session): ) # The threshold is being at least half or above rounded down if count >= threshold: print(f"Used INFERRED: {top_topic}, {count}, with t={threshold}") - new_article.add_new_topic( - top_topic, session, TopicOriginType.INFERRED.value - ) + new_article.add_topic(top_topic, session, TopicOriginType.INFERRED.value) session.add(new_article) return TopicOriginType.INFERRED.value, [ - t.new_topic.title for t in new_article.new_topics + t.topic.title for t in new_article.topics ] return ( @@ -406,7 +404,7 @@ def add_new_topics(new_article, feed, url_keywords, session): if len(topics) == 0 else ( TopicOriginType.URL_PARSED.value, - [t.new_topic.title for t in new_article.new_topics], + [t.topic.title for t in new_article.topics], ) ) diff --git a/zeeguu/core/elastic/elastic_query_builder.py b/zeeguu/core/elastic/elastic_query_builder.py index 9ffba174..c1afabb9 100644 --- a/zeeguu/core/elastic/elastic_query_builder.py +++ b/zeeguu/core/elastic/elastic_query_builder.py @@ -21,7 +21,7 @@ def array_of_lowercase_topics(topics): return [topic.lower() for topic in topics.split()] -def array_of_new_topics(topics): +def array_of_topics(topics): return topics.split(",") if topics != "" else [] @@ -51,11 +51,9 @@ def build_elastic_recommender_query( es_scale, es_offset, es_decay, - new_topics_to_include, - new_topics_to_exclude, + topics_to_include, + topics_to_exclude, page, - user_using_new_topics, - is_es_v7, ): """ @@ -107,33 +105,13 @@ def build_elastic_recommender_query( if not user_topics: user_topics = "" - # if user_topics: - # search_string = user_topics - # should.append(match("content", search_string)) - # should.append(match("title", search_string)) - - # Assumes if a user has new topics they won't be rolled - # back to not have new topics again. - # Essentially, if there are new topics the user won't be able - # to access the old topics anymore. 
- if user_using_new_topics and not is_es_v7: - topics_to_filter_out = array_of_new_topics(new_topics_to_exclude) - if len(new_topics_to_exclude) > 0: - should_remove_topics = [] - for t in topics_to_filter_out: - should_remove_topics.append({"match": {"topics": t}}) - should_remove_topics.append({"match": {"topics_inferred": t}}) - must_not.append({"bool": {"should": should_remove_topics}}) - else: - unwanted_old_topics_arr = array_of_lowercase_topics(unwanted_topics) - if len(unwanted_old_topics_arr) > 0: - must_not.append( - { - "terms": { - "topics" if is_es_v7 else "old_topics": unwanted_old_topics_arr - } - } - ) + topics_to_filter_out = array_of_topics(topics_to_exclude) + if len(topics_to_exclude) > 0: + should_remove_topics = [] + for t in topics_to_filter_out: + should_remove_topics.append({"match": {"topics": t}}) + should_remove_topics.append({"match": {"topics_inferred": t}}) + must_not.append({"bool": {"should": should_remove_topics}}) if unwanted_user_topics: must_not.append(match("content", unwanted_user_topics)) @@ -141,18 +119,13 @@ def build_elastic_recommender_query( must.append(exists("published_time")) - if user_using_new_topics and not is_es_v7: - topics_to_find = array_of_new_topics(new_topics_to_include) - if len(topics_to_find) > 0: - should_topics = [] - for t in topics_to_find: - should_topics.append({"match": {"topics": t}}) - should_topics.append({"match": {"topics_inferred": t}}) - must.append({"bool": {"should": should_topics}}) - else: - topics_arr = array_of_lowercase_topics(topics) - if len(topics_arr) > 0: - must.append({"terms": {"topics" if is_es_v7 else "old_topics": topics_arr}}) + topics_to_find = array_of_topics(topics_to_include) + if len(topics_to_find) > 0: + should_topics = [] + for t in topics_to_find: + should_topics.append({"match": {"topics": t}}) + should_topics.append({"match": {"topics_inferred": t}}) + must.append({"bool": {"should": should_topics}}) bool_query_body["query"]["bool"].update({"must": must}) bool_query_body["query"]["bool"].update({"must_not": must_not}) diff --git a/zeeguu/core/elastic/indexing.py b/zeeguu/core/elastic/indexing.py index 5c6710ab..1699774e 100644 --- a/zeeguu/core/elastic/indexing.py +++ b/zeeguu/core/elastic/indexing.py @@ -7,7 +7,7 @@ from zeeguu.core.semantic_vector_api import get_embedding_from_article -def find_new_topics(article_id, session): +def find_topics(article_id, session): article_topics = ( session.query(Topic) .join(ArticleTopicMap) @@ -40,7 +40,7 @@ def find_filter_url_keywords(article_id, session): def document_from_article(article, session): - topics, topics_inferred = find_new_topics(article.id, session) + topics, topics_inferred = find_topics(article.id, session) doc = { "title": article.title, "author": article.authors, diff --git a/zeeguu/core/model/article.py b/zeeguu/core/model/article.py index bd0b7981..8d7c77c4 100644 --- a/zeeguu/core/model/article.py +++ b/zeeguu/core/model/article.py @@ -82,7 +82,7 @@ class Article(db.Model): uploader_id = Column(Integer, ForeignKey(User.id)) uploader = relationship(User) - new_topics = relationship("ArticleTopicMap", back_populates="article") + topics = relationship("ArticleTopicMap", back_populates="article") url_keywords = relationship("ArticleUrlKeywordMap", back_populates="article") # Few words in an article is very often not an @@ -151,18 +151,18 @@ def vote_broken(self): # somebody could vote that this article is broken self.broken += 1 - def new_topics_as_string(self): + def topics_as_string(self): topics = "" - for topic in 
self.new_topics: - topics += topic.new_topic.title + ", " + for topic in self.topics: + topics += topic.topic.title + ", " return topics - def new_topics_as_tuple(self): + def topics_as_tuple(self): topics = [] - for topic in self.new_topics: - if topic.new_topic.title == "" or topic.new_topic.title is None: + for topic in self.topics: + if topic.topic.title == "" or topic.topic.title is None: continue - topics.append((topic.new_topic.title, topic.origin_type)) + topics.append((topic.topic.title, topic.origin_type)) return topics def contains_any_of(self, keywords: list): @@ -224,8 +224,8 @@ def fk_to_cefr(fk_difficulty): title=self.title, summary=summary, language=self.language.code, - new_topics=self.new_topics_as_string(), - new_topics_list=self.new_topics_as_tuple(), + topics=self.topics_as_string(), + topics_list=self.topics_as_tuple(), video=self.video, metrics=dict( difficulty=self.fk_difficulty / 100, @@ -278,15 +278,15 @@ def article_info_for_teacher(self): def is_owned_by(self, user): return self.uploader_id == user.id - def add_new_topic(self, new_topic, session, origin_type: TopicOriginType): + def add_topic(self, topic, session, origin_type: TopicOriginType): - t = ArticleTopicMap(article=self, new_topic=new_topic, origin_type=origin_type) + t = ArticleTopicMap(article=self, topic=topic, origin_type=origin_type) session.add(t) - def set_new_topics(self, topics, session): + def set_topics(self, topics, session): for t in topics: - self.add_new_topic(t, session, TopicOriginType.URL_PARSED.value) + self.add_topic(t, session, TopicOriginType.URL_PARSED.value) def add_url_keyword(self, url_keyword, rank, session): diff --git a/zeeguu/core/model/article_topic_map.py b/zeeguu/core/model/article_topic_map.py index 45d8e037..12dd315c 100644 --- a/zeeguu/core/model/article_topic_map.py +++ b/zeeguu/core/model/article_topic_map.py @@ -11,10 +11,10 @@ class TopicOriginType(IntEnum): class ArticleTopicMap(db.Model): - __tablename__ = "new_article_topic_map" + __tablename__ = "article_topic_map" # Constants used for origin_type article_id = Column(ForeignKey("article.id"), primary_key=True) - new_topic_id = Column(ForeignKey("new_topic.id"), primary_key=True) + topic_id = Column(ForeignKey("topic.id"), primary_key=True) origin_type = Column(Integer) - article = relationship("Article", back_populates="new_topics") - new_topic = relationship("Topic", back_populates="articles") + article = relationship("Article", back_populates="topics") + topic = relationship("Topic", back_populates="articles") diff --git a/zeeguu/core/model/article_topic_user_feedback.py b/zeeguu/core/model/article_topic_user_feedback.py index c2affaaf..b7372d97 100644 --- a/zeeguu/core/model/article_topic_user_feedback.py +++ b/zeeguu/core/model/article_topic_user_feedback.py @@ -30,21 +30,23 @@ class ArticleTopicUserFeedback(db.Model): user_id = db.Column(db.Integer, db.ForeignKey(User.id)) user = relationship(User) - new_topic_id = db.Column(db.Integer, db.ForeignKey(Topic.id)) - new_topic = relationship(Topic) + topic_id = db.Column(db.Integer, db.ForeignKey(Topic.id)) + topic = relationship(Topic) feedback = db.Column(db.String(50)) - UniqueConstraint(article_id, user_id, new_topic_id) + UniqueConstraint(article_id, user_id, topic_id) def __init__(self, article, user, topic, feedback): self.article = article self.user = user - self.new_topic = topic + self.topic = topic self.feedback = feedback def __str__(self): - return f"User New Topic Feedback ({self.user.name}, {self.new_topic}: {self.feedback})" + return ( + 
f"User New Topic Feedback ({self.user.name}, {self.topic}: {self.feedback})" + ) __repr__ = __str__ @@ -54,7 +56,7 @@ def find_or_create(cls, session, article, user, topic, feedback): return ( cls.query.filter(cls.article == article) .filter(cls.user == user) - .filter(cls.new_topic == topic) + .filter(cls.topic == topic) .filter(cls.article == article) .filter(cls.feedback == feedback) .one() @@ -91,9 +93,9 @@ def with_id(cls, i): return (cls.query.filter(cls.id == i)).one() @classmethod - def with_topic_id(cls, new_topic_id, user): + def with_topic_id(cls, topic_id, user): return ( - (cls.query.filter(cls.new_topic_id == new_topic_id)) + (cls.query.filter(cls.topic_id == topic_id)) .filter(cls.user_id == user.id) .one() ) diff --git a/zeeguu/core/model/topic.py b/zeeguu/core/model/topic.py index 28c3df5e..d743358d 100644 --- a/zeeguu/core/model/topic.py +++ b/zeeguu/core/model/topic.py @@ -19,12 +19,12 @@ class Topic(db.Model): """ __table_args__ = {"mysql_collate": "utf8_bin"} - __tablename__ = "new_topic" + __tablename__ = "topic" id = Column(Integer, primary_key=True) title = Column(String(64)) - articles = relationship("ArticleTopicMap", back_populates="new_topic") + articles = relationship("ArticleTopicMap", back_populates="topic") language_topic_available_cache = {} def __init__(self, title): diff --git a/zeeguu/core/model/topic_filter.py b/zeeguu/core/model/topic_filter.py index aea89670..c1d491bd 100644 --- a/zeeguu/core/model/topic_filter.py +++ b/zeeguu/core/model/topic_filter.py @@ -21,33 +21,31 @@ class TopicFilter(db.Model): """ __table_args__ = {"mysql_collate": "utf8_bin"} - __tablename__ = "new_topic_filter" + __tablename__ = "topic_filter" id = db.Column(db.Integer, primary_key=True) user_id = db.Column(db.Integer, db.ForeignKey(User.id)) user = relationship(User) - new_topic_id = db.Column(db.Integer, db.ForeignKey(Topic.id)) - new_topic = relationship(Topic) + topic_id = db.Column(db.Integer, db.ForeignKey(Topic.id)) + topic = relationship(Topic) - UniqueConstraint(user_id, new_topic_id) + UniqueConstraint(user_id, topic_id) def __init__(self, user, topic): self.user = user - self.new_topic = topic + self.topic = topic def __str__(self): - return f"Topic filter ({self.user.name}, {self.new_topic})" + return f"Topic filter ({self.user.name}, {self.topic})" __repr__ = __str__ @classmethod def find_or_create(cls, session, user, topic): try: - return ( - cls.query.filter(cls.user == user).filter(cls.new_topic == topic).one() - ) + return cls.query.filter(cls.user == user).filter(cls.topic == topic).one() except sqlalchemy.orm.exc.NoResultFound: new = cls(user, topic) session.add(new) @@ -69,7 +67,5 @@ def with_id(cls, i): @classmethod def with_topic_id(cls, i, user): return ( - (cls.query.filter(cls.new_topic_id == i)) - .filter(cls.user_id == user.id) - .one() + (cls.query.filter(cls.topic_id == i)).filter(cls.user_id == user.id).one() ) diff --git a/zeeguu/core/model/topic_subscription.py b/zeeguu/core/model/topic_subscription.py index 76115bd6..8d6a3c85 100644 --- a/zeeguu/core/model/topic_subscription.py +++ b/zeeguu/core/model/topic_subscription.py @@ -21,33 +21,31 @@ class TopicSubscription(db.Model): """ __table_args__ = {"mysql_collate": "utf8_bin"} - __tablename__ = "new_topic_subscription" + __tablename__ = "topic_subscription" id = db.Column(db.Integer, primary_key=True) user_id = db.Column(db.Integer, db.ForeignKey(User.id)) user = relationship(User) - new_topic_id = db.Column(db.Integer, db.ForeignKey(Topic.id)) - new_topic = relationship(Topic) + topic_id = 
db.Column(db.Integer, db.ForeignKey(Topic.id)) + topic = relationship(Topic) - UniqueConstraint(user_id, new_topic_id) + UniqueConstraint(user_id, topic_id) def __init__(self, user, topic): self.user = user - self.new_topic = topic + self.topic = topic def __str__(self): - return f"Topic subscription ({self.user.name}, {self.new_topic})" + return f"Topic subscription ({self.user.name}, {self.topic})" __repr__ = __str__ @classmethod def find_or_create(cls, session, user, topic): try: - return ( - cls.query.filter(cls.user == user).filter(cls.new_topic == topic).one() - ) + return cls.query.filter(cls.user == user).filter(cls.topic == topic).one() except sqlalchemy.orm.exc.NoResultFound: new = cls(user, topic) session.add(new) @@ -68,7 +66,5 @@ def with_id(cls, i): @classmethod def with_topic_id(cls, i, user): return ( - (cls.query.filter(cls.new_topic_id == i)) - .filter(cls.user_id == user.id) - .one() + (cls.query.filter(cls.topic_id == i)).filter(cls.user_id == user.id).one() ) diff --git a/zeeguu/core/model/url_keyword.py b/zeeguu/core/model/url_keyword.py index 82e68845..0c3e2e67 100644 --- a/zeeguu/core/model/url_keyword.py +++ b/zeeguu/core/model/url_keyword.py @@ -59,16 +59,16 @@ class UrlKeyword(db.Model): language_id = db.Column(db.Integer, db.ForeignKey(Language.id)) language = relationship(Language) - new_topic_id = db.Column(db.Integer, db.ForeignKey(Topic.id)) - new_topic = relationship(Topic) + topic_id = db.Column(db.Integer, db.ForeignKey(Topic.id)) + topic = relationship(Topic) keyword = db.Column(db.String(45)) articles = relationship("ArticleUrlKeywordMap", back_populates="url_keyword") - def __init__(self, keyword: str, language: Language, new_topic: Topic = None): + def __init__(self, keyword: str, language: Language, topic: Topic = None): self.language = language - self.new_topic = new_topic + self.topic = topic self.keyword = keyword def __str__(self): @@ -80,9 +80,7 @@ def get_keyword(self): __repr__ = __str__ @classmethod - def find_or_create( - cls, session, keyword, language: Language, new_topic: Topic = None - ): + def find_or_create(cls, session, keyword, language: Language, topic: Topic = None): try: return ( cls.query.filter(cls.keyword == keyword) @@ -90,7 +88,7 @@ def find_or_create( .one() ) except sqlalchemy.orm.exc.NoResultFound: - new = cls(keyword, language, new_topic) + new = cls(keyword, language, topic) session.add(new) session.commit() return new diff --git a/zeeguu/core/model/user_article.py b/zeeguu/core/model/user_article.py index f048098f..337580ec 100644 --- a/zeeguu/core/model/user_article.py +++ b/zeeguu/core/model/user_article.py @@ -258,26 +258,26 @@ def user_article_info( user_diff_feedback = ArticleDifficultyFeedback.find(user, article) - user_new_topics_feedback = ArticleTopicUserFeedback.find_given_user_article( + user_topics_feedback = ArticleTopicUserFeedback.find_given_user_article( article, user ) - if user_new_topics_feedback: - article_topic_list = returned_info["new_topics_list"] - new_topic_list = [] + if user_topics_feedback: + article_topic_list = returned_info["topics_list"] + topic_list = [] topics_to_remove = set( [ - untf.new_topic.title - for untf in user_new_topics_feedback + untf.topic.title + for untf in user_topics_feedback if untf.feedback == ArticleTopicUserFeedback.DO_NOT_SHOW_FEEDBACK ] ) for each in article_topic_list: title, _ = each if title not in topics_to_remove: - new_topic_list.append(each) - returned_info["new_topics_list"] = new_topic_list - returned_info["new_topics"] = ",".join([t for t, _ in 
new_topic_list]) + topic_list.append(each) + returned_info["topics_list"] = topic_list + returned_info["topics"] = ",".join([t for t, _ in topic_list]) if not user_article_info: returned_info["starred"] = False From 1b1738c003f11f11e67cc99043f95983f3a9f885 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Tue, 12 Nov 2024 15:23:29 +0100 Subject: [PATCH 50/71] Renaming article_topic_user_feedback column - Match the new topic --- tools/migrations/24-11-12-rename_new_topics.sql | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/migrations/24-11-12-rename_new_topics.sql b/tools/migrations/24-11-12-rename_new_topics.sql index c3138b54..a7b867e3 100644 --- a/tools/migrations/24-11-12-rename_new_topics.sql +++ b/tools/migrations/24-11-12-rename_new_topics.sql @@ -65,6 +65,18 @@ ADD ALTER TABLE `zeeguu_test`.`new_article_topic_map` RENAME TO `zeeguu_test`.`article_topic_map`; +/*article_topic_user_feedback, update reference column name*/ +ALTER TABLE + `zeeguu_test`.`article_topic_user_feedback` DROP FOREIGN KEY `article_topic_user_feedback_ibfk_3`; + +ALTER TABLE + `zeeguu_test`.`article_topic_user_feedback` CHANGE COLUMN `new_topic_id` `topic_id` INT NULL DEFAULT NULL; + +ALTER TABLE + `zeeguu_test`.`article_topic_user_feedback` +ADD + CONSTRAINT `article_topic_user_feedback_ibfk_3` FOREIGN KEY (`topic_id`) REFERENCES `zeeguu_test`.`topic` (`id`); + /* url_keyword, update reference column name */ ALTER TABLE `zeeguu_test`.`url_keyword` DROP FOREIGN KEY `url_keyword_ibfk_2`; From 7f20a0db2284ff1c1d79fa8a88ece8910606b70d Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Tue, 12 Nov 2024 15:23:48 +0100 Subject: [PATCH 51/71] Remove ES7 checks - We no longer need to check for ES7 in the code --- zeeguu/core/content_recommender/elastic_recommender.py | 2 -- zeeguu/core/elastic/indexing.py | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/zeeguu/core/content_recommender/elastic_recommender.py b/zeeguu/core/content_recommender/elastic_recommender.py index 08fb4588..3a09404c 100644 --- a/zeeguu/core/content_recommender/elastic_recommender.py +++ b/zeeguu/core/content_recommender/elastic_recommender.py @@ -127,7 +127,6 @@ def article_recommendations_for_user( ) = _prepare_user_constraints(user) es = Elasticsearch(ES_CONN_STRING) - es_version = int(es.info()["version"]["number"][0]) # build the query using elastic_query_builder query_body = build_elastic_recommender_query( @@ -143,7 +142,6 @@ def article_recommendations_for_user( topics_to_include=topics_to_include, topics_to_exclude=topics_to_exclude, page=page, - is_es_v7=es_version == 7, ) res = es.search(index=ES_ZINDEX, body=query_body) diff --git a/zeeguu/core/elastic/indexing.py b/zeeguu/core/elastic/indexing.py index 1699774e..ba5ff91c 100644 --- a/zeeguu/core/elastic/indexing.py +++ b/zeeguu/core/elastic/indexing.py @@ -65,8 +65,7 @@ def document_from_article(article, session): def create_or_update(article, session): es = Elasticsearch(ES_CONN_STRING) - es_version = int(es.info()["version"]["number"][0]) - doc = document_from_article(article, session, is_v7=es_version == 7) + doc = document_from_article(article, session) if es.exists(index=ES_ZINDEX, id=article.id): es.delete(index=ES_ZINDEX, id=article.id) From 21856ebc531f41f48b2007e17befd638c6a2528e Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Wed, 13 Nov 2024 12:15:54 +0100 Subject: [PATCH 52/71] Update users_recently_active.py --- tools/users_recently_active.py | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 
deletions(-) diff --git a/tools/users_recently_active.py b/tools/users_recently_active.py index 14c6dbf1..5cd948f6 100755 --- a/tools/users_recently_active.py +++ b/tools/users_recently_active.py @@ -6,6 +6,10 @@ """ +DAYS_SINCE_ACTIVE = 30 +SHOW_TEACHER_NAMES = False +SHOW_STUDENT_NAMES = False + from zeeguu.api.app import create_app app = create_app() @@ -17,7 +21,7 @@ from zeeguu.core.model import User -for user_id in User.all_recent_user_ids(): +for user_id in User.all_recent_user_ids(DAYS_SINCE_ACTIVE): user = User.find_by_id(user_id) # print(f"{user.name} ({user.email})") for ucmap in user.cohorts: @@ -26,14 +30,29 @@ cohort_student_map[ucmap.cohort].append(user.name) # print("") -for cohort, values in cohort_student_map.items(): - print(f"============================") - print(f"{cohort.name} ({cohort.language.code if cohort.language else ''})") - print(f"============================") + +ordered_cohorts = sorted( + cohort_student_map.keys(), key=lambda x: len(cohort_student_map[x]), reverse=True +) + +print(f"Users active in the last {DAYS_SINCE_ACTIVE} days") +for cohort in ordered_cohorts: + values = cohort_student_map[cohort] + print(f"========================================================") + print( + f"{cohort.name} ({cohort.id}) " + f"\nLang: {cohort.language.code if cohort.language else ''} " + f"\nInv Code: {cohort.inv_code} " + f"\nActive Students: {len(values)}" + ) + print(f"========================================================") for teacher in cohort.get_teachers(): - print(f" {teacher.name} ({teacher.email})") + if SHOW_TEACHER_NAMES: + print(f" {teacher.name} ({teacher.email})") + for v in values: - print(" - ", v) + if SHOW_STUDENT_NAMES: + print(" - ", v) print(" ") From ec4e32af88c4856da957d013f980b37e09f1c82f Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Wed, 13 Nov 2024 12:17:30 +0100 Subject: [PATCH 53/71] Update users_recently_active.py --- tools/users_recently_active.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/users_recently_active.py b/tools/users_recently_active.py index 5cd948f6..13a74395 100755 --- a/tools/users_recently_active.py +++ b/tools/users_recently_active.py @@ -36,6 +36,7 @@ ) print(f"Users active in the last {DAYS_SINCE_ACTIVE} days") +total_users = 0 for cohort in ordered_cohorts: values = cohort_student_map[cohort] print(f"========================================================") @@ -52,7 +53,10 @@ print(f" {teacher.name} ({teacher.email})") for v in values: + total_users += 1 if SHOW_STUDENT_NAMES: print(" - ", v) print(" ") + +print("Total users: ", total_users) From c4c43876682ccb7a96c0c779f68b15262582597f Mon Sep 17 00:00:00 2001 From: Mircea Filip Lungu Date: Wed, 13 Nov 2024 12:29:19 +0100 Subject: [PATCH 54/71] formatting --- tools/users_recently_active.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tools/users_recently_active.py b/tools/users_recently_active.py index 13a74395..1a782c55 100755 --- a/tools/users_recently_active.py +++ b/tools/users_recently_active.py @@ -7,8 +7,8 @@ """ DAYS_SINCE_ACTIVE = 30 -SHOW_TEACHER_NAMES = False -SHOW_STUDENT_NAMES = False +SHOW_TEACHER_NAMES = True +SHOW_STUDENT_NAMES = True from zeeguu.api.app import create_app @@ -39,18 +39,23 @@ total_users = 0 for cohort in ordered_cohorts: values = cohort_student_map[cohort] + print("") print(f"========================================================") print( f"{cohort.name} ({cohort.id}) " - f"\nLang: {cohort.language.code if cohort.language else ''} " - f"\nInv Code: 
{cohort.inv_code} " - f"\nActive Students: {len(values)}" + f"\nLang: {cohort.language.name if cohort.language else ''} " + f"\nCode: {cohort.inv_code} " ) - print(f"========================================================") + if SHOW_TEACHER_NAMES: + print("\nTeachers: ") for teacher in cohort.get_teachers(): if SHOW_TEACHER_NAMES: - print(f" {teacher.name} ({teacher.email})") + print(f" - {teacher.name} ({teacher.email})") + if SHOW_TEACHER_NAMES: + print("") + + print(f"Active Students: {len(values)}") for v in values: total_users += 1 From 3f418a559d70495da48517badf950650b0f53e7b Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Thu, 14 Nov 2024 11:01:53 +0100 Subject: [PATCH 55/71] Renamed test to match class name --- zeeguu/core/test/{test_topic_keywords.py => test_url_keywords.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename zeeguu/core/test/{test_topic_keywords.py => test_url_keywords.py} (100%) diff --git a/zeeguu/core/test/test_topic_keywords.py b/zeeguu/core/test/test_url_keywords.py similarity index 100% rename from zeeguu/core/test/test_topic_keywords.py rename to zeeguu/core/test/test_url_keywords.py From 42201e85966a4b6078eee94131d4c69d3e7dc281 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Thu, 14 Nov 2024 11:04:48 +0100 Subject: [PATCH 56/71] Update generate_report.py - Added a way to visualize how many of the topics are inferred / assigned keywords. --- tools/report_generator/generate_report.py | 49 +++++++++++++++++++---- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/tools/report_generator/generate_report.py b/tools/report_generator/generate_report.py index cd8ebf68..ccb1fcbb 100644 --- a/tools/report_generator/generate_report.py +++ b/tools/report_generator/generate_report.py @@ -79,8 +79,12 @@ def save_fig_params(filename): def get_new_repeating_sents_table(pd_repeating_sents): return generate_html_table(pd_repeating_sents.sort_values("Count", ascending=False)) + def get_new_url_keywords_table(pd_url_keywords_count): - return generate_html_table(pd_url_keywords_count.sort_values("count", ascending=False)) + return generate_html_table( + pd_url_keywords_count.sort_values("count", ascending=False) + ) + def get_rejected_sentences_table(total_deleted_sents): total_deleted_sents["Total"] = sum(total_deleted_sents.values()) @@ -214,21 +218,50 @@ def generate_topic_coverage_plot(article_df, article_with_topics_df): def generate_new_topic_coverage_plot(article_df, article_with_topics_df): filename = f"new_topic_coverage_plot_{date_str}_d{DAYS_FOR_REPORT}.png" - article_df["has_topic"] = "No" - article_df.loc[article_df.id.isin(article_with_topics_df.id), "has_topic"] = "Yes" + article_df.loc[:, "Has Topic"] = "No Topic" + from zeeguu.core.model.new_article_topic_map import TopicOriginType + + article_df.loc[ + article_df.id.isin( + article_with_topics_df[ + article_with_topics_df.origin_type == TopicOriginType.HARDSET + ].id + ), + "Has Topic", + ] = "Hardset Topic" + article_df.loc[ + article_df.id.isin( + article_with_topics_df[ + article_with_topics_df.origin_type == TopicOriginType.URL_PARSED + ].id + ), + "Has Topic", + ] = "Url Keyword Topic" + article_df.loc[ + article_df.id.isin( + article_with_topics_df[ + article_with_topics_df.origin_type == TopicOriginType.INFERRED + ].id + ), + "Has Topic", + ] = "Inferred Topic" + articles_with_topics = ( - article_df.groupby("Language") - .has_topic.value_counts(normalize=True) + article_df.groupby("Language")["Has Topic"] + .value_counts(normalize=True) .reset_index() ) + sns.barplot( 
x="Language", y="proportion", - hue="has_topic", + hue="Has Topic", data=articles_with_topics, palette={ - "Yes": sns.color_palette("vlag")[0], - "No": sns.color_palette("vlag")[5], + "Inferred Topic": sns.color_palette("vlag")[2], + "Url Keyword Topic": sns.color_palette("vlag")[0], + "Hardset Topic": sns.color_palette("vlag")[1], + "No Topic": sns.color_palette("vlag")[5], }, ) plt.title("Proportion of Articles with New Topics") From a03b2d0c07c005fd3fec77ed989a111ed64b2639 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Thu, 14 Nov 2024 12:09:37 +0100 Subject: [PATCH 57/71] Update generate_report.py - Remove new_ in variable names --- tools/report_generator/generate_report.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/report_generator/generate_report.py b/tools/report_generator/generate_report.py index bf939fe9..605e0fe9 100644 --- a/tools/report_generator/generate_report.py +++ b/tools/report_generator/generate_report.py @@ -515,7 +515,7 @@ def generate_html_page(): feed_df = data_extractor.get_feed_df() article_df = data_extractor.get_article_df(feed_df) - new_article_topics_df = data_extractor.get_article_topics_df(feed_df) + article_topics_df = data_extractor.get_article_topics_df(feed_df) language_df = data_extractor.get_language_df() bookmark_df = data_extractor.get_bookmark_df() data_extractor.add_stats_to_feed(feed_df, article_df) @@ -528,7 +528,7 @@ def generate_html_page(): user_exercise_time_df, user_reading_time_df ) ) - new_topic_reading_time_df = data_extractor.get_topic_reading_time() + topic_reading_time_df = data_extractor.get_topic_reading_time() total_unique_articles_opened_by_users = len( article_df[article_df.id.isin(user_reading_time_df.id)] ) @@ -539,7 +539,7 @@ def generate_html_page(): top_subscribed_searches = data_extractor.get_top_search_subscriptions() top_filtered_searches = data_extractor.get_top_search_filters() newly_added_search_subscriptions = data_extractor.get_added_search_subscriptions() - pd_new_url_keywords = data_extractor.get_url_keyword_counts() + pd_url_keywords = data_extractor.get_url_keyword_counts() pd_feed_innactivity_time = data_extractor.get_days_since_last_crawl() crawl_report = CrawlReport() crawl_report.load_crawl_report_data(DAYS_FOR_REPORT) @@ -556,7 +556,7 @@ def generate_html_page(): else f"WARNING! This date only contains values from the last '{total_days_from_crawl_report}' day(s)." ) ACTIVE_USER_ACTIVITY_TIME_MIN = 1 - articles_with_new_topic_count = len(new_article_topics_df.id.unique()) + articles_with_topic_count = len(article_topics_df.id.unique()) active_users = combined_user_activity_df[ ( combined_user_activity_df["total_reading_time"] @@ -573,14 +573,14 @@ def generate_html_page(): lang_report += f"""

{lang}

Articles Downloaded

- +

User Activity

""" if lang in active_users["Language"].values: lang_report += f"""

Total Active users: {len(active_users[active_users["Language"] == lang])}

- + @@ -619,8 +619,8 @@ def generate_html_page():

Total Articles Crawled: {len(article_df)}

Total Unique Articles Opened: {total_unique_articles_opened_by_users} -

New Topic Coverage: {((articles_with_new_topic_count / len(article_df)) * 100) if len(article_df) > 0 else 0:.2f}%

- +

Topic Coverage: {((articles_with_topic_count / len(article_df)) * 100) if len(article_df) > 0 else 0:.2f}%

+

Possible Inactive feeds:

Full table

@@ -656,7 +656,7 @@ def generate_html_page(): {generate_top_opened_articles(user_reading_time_df, data_extractor, feed_df)} - + """ result += f""" @@ -679,7 +679,7 @@ def generate_html_page(): result += f"""

New URL keywords without topics:

URL keywords that occur more than 100 times in articles and are not mapped to a topic. They are unique per language.

- {get_url_keywords_table(pd_new_url_keywords) if DAYS_FOR_REPORT <= 7 else "

Skipped due to long period.

"} + {get_url_keywords_table(pd_url_keywords) if DAYS_FOR_REPORT <= 7 else "

Skipped due to long period.

"}

Feed activity:

{generate_html_table(pd_feed_innactivity_time)}

From 0c5cb96a2bed1ab9b16682edbf348c7345f83a7d Mon Sep 17 00:00:00 2001
From: Tiago Ribeiro
Date: Thu, 14 Nov 2024 16:33:14 +0100
Subject: [PATCH 58/71] Update 24-11-12-rename_new_topics.sql

- Fix SQL error

---
 tools/migrations/24-11-12-rename_new_topics.sql | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tools/migrations/24-11-12-rename_new_topics.sql b/tools/migrations/24-11-12-rename_new_topics.sql
index a7b867e3..8e513802 100644
--- a/tools/migrations/24-11-12-rename_new_topics.sql
+++ b/tools/migrations/24-11-12-rename_new_topics.sql
@@ -9,8 +9,7 @@ ALTER TABLE
   `zeeguu_test`.`new_topic_filter` DROP FOREIGN KEY `new_topic_filter_ibfk_2`;

 ALTER TABLE
-  `zeeguu_test`.`new_topic_filter` CHANGE COLUMN `new_topic_id` `topic_id` INT NULL DEFAULT NULL,
-  RENAME TO `zeeguu_test`.`topic_filter`;
+  `zeeguu_test`.`new_topic_filter` CHANGE COLUMN `new_topic_id` `topic_id` INT NULL DEFAULT NULL;

 ALTER TABLE
   `zeeguu_test`.`new_topic_filter`

From 98867b4363cd8a5e487636b75c4c11590529153d Mon Sep 17 00:00:00 2001
From: Tiago Ribeiro
Date: Fri, 15 Nov 2024 10:18:27 +0100
Subject: [PATCH 59/71] Added equality comparison for Topics

---
 zeeguu/core/model/topic.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/zeeguu/core/model/topic.py b/zeeguu/core/model/topic.py
index d743358d..e6a370ca 100644
--- a/zeeguu/core/model/topic.py
+++ b/zeeguu/core/model/topic.py
@@ -33,6 +33,9 @@ def __init__(self, title):
     def __repr__(self):
         return f"<Topic {self.title}>"

+    def __eq__(self, other):
+        return self.id == other.id and self.title == other.title
+
     def as_dictionary(self):

         return dict(

From c31f587f00d051eb516c9ba21f281f23bddc00a1 Mon Sep 17 00:00:00 2001
From: Tiago Ribeiro
Date: Fri, 15 Nov 2024 10:18:44 +0100
Subject: [PATCH 60/71] Added test for adding article topics

---
 zeeguu/core/test/rules/topic_rule.py     | 52 ++++++++++++++++++++++++
 zeeguu/core/test/test_article.py         | 14 ++++++-
 zeeguu/core/test/test_localized_topic.py | 50 -----------------------
 3 files changed, 65 insertions(+), 51 deletions(-)
 create mode 100644 zeeguu/core/test/rules/topic_rule.py
 delete mode 100644 zeeguu/core/test/test_localized_topic.py

diff --git a/zeeguu/core/test/rules/topic_rule.py b/zeeguu/core/test/rules/topic_rule.py
new file mode 100644
index 00000000..4a82087d
--- /dev/null
+++ b/zeeguu/core/test/rules/topic_rule.py
@@ -0,0 +1,52 @@
+import random
+
+from sqlalchemy.exc import OperationalError
+from sqlalchemy.orm.exc import NoResultFound, ObjectDeletedError
+
+from zeeguu.core.test.rules.base_rule import BaseRule
+from zeeguu.core.model.topic import Topic
+
+
+class TopicRule(BaseRule):
+    """A Testing Rule class for the model class zeeguu.core.model.Topic
+
+    Holds the default set of topics as class data. Topics are created and
+    saved to the database if they don't yet exist in the database.
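+
+    Usage sketch (for illustration): TopicRule.get_or_create_topic(7)
+    returns the persisted "Politics" topic, creating and saving it first
+    if it is not yet in the database.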
+ """ + + topics = { + 1: "Sports", + 2: "Culture & Art", + 3: "Technology & Science", + 4: "Travel & Tourism", + 5: "Health & Society", + 6: "Business", + 7: "Politics", + 8: "Satire", + } + + @classmethod + def get_or_create_topic(cls, topic_id): + topic = Topic.find_by_id(topic_id) + if topic: + return topic + else: + return TopicRule.__create_new_topic(topic_id) + + @classmethod + def __create_new_topic(cls, topic_id): + topic_name = cls.topics.get(topic_id) + + if topic_name is None: + raise KeyError + + new_topic = Topic(topic_name) + + cls.save(new_topic) + + return new_topic + + @property + def random(self): + random_id, __ = random.choice(list(self.topics.items())) + return self.get_or_create_language(random_id) diff --git a/zeeguu/core/test/test_article.py b/zeeguu/core/test/test_article.py index 0c2addf2..9ae8ee19 100644 --- a/zeeguu/core/test/test_article.py +++ b/zeeguu/core/test/test_article.py @@ -3,9 +3,11 @@ from zeeguu.core.test.model_test_mixin import ModelTestMixIn import zeeguu.core +from zeeguu.core.model.article_topic_map import TopicOriginType from zeeguu.core.test.rules.article_rule import ArticleRule from zeeguu.core.test.rules.language_rule import LanguageRule -from zeeguu.core.model import Article +from zeeguu.core.test.rules.topic_rule import TopicRule +from zeeguu.core.model import Article, Topic from zeeguu.core.test.mocking_the_web import ( URL_CNN_KATHMANDU, URL_SPIEGEL_VENEZUELA, @@ -21,6 +23,16 @@ def setUp(self): self.article2 = ArticleRule().article self.language = LanguageRule.get_or_create_language("en") + def test_add_topic(self): + sports = TopicRule.get_or_create_topic(1) + health_society = TopicRule.get_or_create_topic(5) + self.article1.add_topic(health_society, session, TopicOriginType.HARDSET) + self.article1.add_topic(sports, session, TopicOriginType.HARDSET) + assert len(self.article1.topics) == 2 + article_topics = [atm.topic for atm in self.article1.topics] + assert sports in article_topics + assert health_society in article_topics + def test_articles_are_different(self): assert self.article1.title != self.article2.title diff --git a/zeeguu/core/test/test_localized_topic.py b/zeeguu/core/test/test_localized_topic.py deleted file mode 100644 index 079e702d..00000000 --- a/zeeguu/core/test/test_localized_topic.py +++ /dev/null @@ -1,50 +0,0 @@ -from unittest import TestCase - -import zeeguu.core -from sqlalchemy.orm.exc import NoResultFound - -from zeeguu.core.model import Topic, LocalizedTopic, Article, Url -from zeeguu.core.test.model_test_mixin import ModelTestMixIn -from zeeguu.core.test.rules.article_rule import ArticleRule -from zeeguu.core.test.rules.language_rule import LanguageRule -from zeeguu.core.test.rules.url_rule import UrlRule -from zeeguu.core.test.rules.user_rule import UserRule -from zeeguu.core.model.language import Language -from zeeguu.core.model import db - -db_session = zeeguu.core.model.db.session - - -class LocalizedTopicTest(ModelTestMixIn, TestCase): - def setUp(self): - super().setUp() - self.user = UserRule().user - - def test_topic_matching(self): - self._localized_url_keywords_in_url( - "World", - "World", - "theguardian.com/world", - "https://www.theguardian.com/world/2020/jun/06/new-zealand-readers", - ) - - def test_topic_matching_is_case_sensitive(self): - self._localized_url_keywords_in_url( - "Music", - "Muziek", - "the-Voice", - "https://www.nu.nl/media/6056161/winnaar-negende-seizoen-van-the-Voice-kids-bekend.html", - ) - - def _localized_url_keywords_in_url( - self, topic: str, localized: str, keyword: 
From 6df021f665838f0b750b485c018bc90b2467c3ca Mon Sep 17 00:00:00 2001
From: Tiago Ribeiro
Date: Fri, 15 Nov 2024 13:19:50 +0100
Subject: [PATCH 61/71] Update tests

- Tests should behave as they did in the past.

---
 zeeguu/core/test/test_article.py              | 12 +++++------
 zeeguu/core/test/test_retrieve_and_compute.py | 20 +++++++++++++++++++
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/zeeguu/core/test/test_article.py b/zeeguu/core/test/test_article.py
index 9ae8ee19..83a60475 100644
--- a/zeeguu/core/test/test_article.py
+++ b/zeeguu/core/test/test_article.py
@@ -23,6 +23,12 @@ def setUp(self):
         self.article2 = ArticleRule().article
         self.language = LanguageRule.get_or_create_language("en")
 
+    def test_articles_are_different(self):
+        assert self.article1.title != self.article2.title
+
+    def test_article_representation_does_not_error(self):
+        assert self.article1.article_info()
+
     def test_add_topic(self):
         sports = TopicRule.get_or_create_topic(1)
         health_society = TopicRule.get_or_create_topic(5)
@@ -33,12 +39,6 @@ def test_add_topic(self):
         assert sports in article_topics
         assert health_society in article_topics
 
-    def test_articles_are_different(self):
-        assert self.article1.title != self.article2.title
-
-    def test_article_representation_does_not_error(self):
-        assert self.article1.article_info()
-
     def test_find_or_create(self):
         self.new_art = Article.find_or_create(session, URL_SPIEGEL_VENEZUELA)
         assert self.new_art.fk_difficulty
diff --git a/zeeguu/core/test/test_retrieve_and_compute.py b/zeeguu/core/test/test_retrieve_and_compute.py
index c08afc88..4fac77f5 100644
--- a/zeeguu/core/test/test_retrieve_and_compute.py
+++ b/zeeguu/core/test/test_retrieve_and_compute.py
@@ -5,6 +5,8 @@
 from zeeguu.core.test.rules.language_rule import LanguageRule
 from zeeguu.core.test.rules.feed_rule import FeedRule
 from zeeguu.core.test.rules.user_rule import UserRule
+from zeeguu.core.test.rules.topic_rule import TopicRule
+from zeeguu.core.model.url_keyword import UrlKeyword
 from zeeguu.core.content_cleaning.content_cleaner import cleanup_non_content_bits
 from zeeguu.core.content_retriever.article_downloader import download_from_feed
 from zeeguu.core.content_quality.quality_filter import (
@@ -34,6 +36,24 @@ def testDifficultyOfFeedItems(self):
         assert len(articles) == 2
         assert articles[0].fk_difficulty
 
+    def testDownloadWithTopic(self):
+        # Check that the topic associated with the url keyword is correctly added.
+        feed = FeedRule().feed1
+        topic = TopicRule.get_or_create_topic(7)
+        url_keyword = UrlKeyword.find_or_create(
+            zeeguu.core.model.db.session, "politik", self.lan, topic
+        )
+        crawl_report = CrawlReport()
+        crawl_report.add_feed(feed)
+        download_from_feed(feed, zeeguu.core.model.db.session, crawl_report, 3, False)
+
+        article = feed.get_articles(limit=2)[0]
+        # http://www.spiegel.de/politik/ausland/venezuela-militaer-unterstuetzt-nicolas-maduro-im-machtkampf-gegen-juan-guaido-a-1249616.html
+        #
+        #
+        assert url_keyword in [aukm.url_keyword for aukm in article.url_keywords]
+        assert topic in [atm.topic for atm in article.topics]
+
     def test_sufficient_quality(self):
         art = newspaper.Article(URL_PROPUBLICA_INVESTING)
         art.download()
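testDownloadWithTopic above pins down the url-keyword path: a UrlKeyword that carries a topic should propagate it to every article downloaded under that keyword. The same link powers the backfill script archived in the next patch; the sketch below restates its query shape against the current model names (the function wrapper and import paths are assumptions; the join/filter chain mirrors the diff that follows).

    # Sketch only: the backfill idea behind set_new_topics_from_url_keyword.py,
    # written against the renamed (current) models; the wrapper is hypothetical.
    from zeeguu.core.model import Article, ArticleUrlKeywordMap, UrlKeyword
    from zeeguu.core.model.article_topic_map import ArticleTopicMap


    def article_ids_missing_a_topic(session):
        # Articles whose url keyword already knows its topic but which have
        # no ArticleTopicMap row yet -- candidates for topic backfilling.
        return (
            session.query(Article.id)
            .join(ArticleUrlKeywordMap)
            .join(UrlKeyword)
            .join(ArticleTopicMap, isouter=True)
            .filter(UrlKeyword.topic != None)
            .filter(ArticleTopicMap.topic_id == None)
            .all()
        )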
From 5f5173bbfd680a117a5d88c93682de7db48dad1e Mon Sep 17 00:00:00 2001
From: Tiago Ribeiro
Date: Fri, 15 Nov 2024 13:48:24 +0100
Subject: [PATCH 62/71] Rollback changes to the migration code

- Instead, move them to the old folder.

---
 .../es_v8_migration/migrate_old_topics_to_new_topics.py   | 6 +++---
 .../es_v8_migration/set_new_topics_from_url_keyword.py    | 8 ++++----
 .../es_v8_migration/set_topic_mapping_to_keywords.py      | 4 ++--
 .../{ => old}/es_v8_migration/set_url_keywords_article.py | 0
 tools/{ => old}/es_v8_migration/url_topics.py             | 0
 .../es_v8_migration/url_topics_count_with_pred_to_db.csv  | 0
 6 files changed, 9 insertions(+), 9 deletions(-)
 rename tools/{ => old}/es_v8_migration/migrate_old_topics_to_new_topics.py (88%)
 rename tools/{ => old}/es_v8_migration/set_new_topics_from_url_keyword.py (91%)
 rename tools/{ => old}/es_v8_migration/set_topic_mapping_to_keywords.py (88%)
 rename tools/{ => old}/es_v8_migration/set_url_keywords_article.py (100%)
 rename tools/{ => old}/es_v8_migration/url_topics.py (100%)
 rename tools/{ => old}/es_v8_migration/url_topics_count_with_pred_to_db.csv (100%)

diff --git a/tools/es_v8_migration/migrate_old_topics_to_new_topics.py b/tools/old/es_v8_migration/migrate_old_topics_to_new_topics.py
similarity index 88%
rename from tools/es_v8_migration/migrate_old_topics_to_new_topics.py
rename to tools/old/es_v8_migration/migrate_old_topics_to_new_topics.py
index 8308c507..48f1ba3e 100644
--- a/tools/es_v8_migration/migrate_old_topics_to_new_topics.py
+++ b/tools/old/es_v8_migration/migrate_old_topics_to_new_topics.py
@@ -7,7 +7,7 @@
 import zeeguu.core
 from zeeguu.api.app import create_app
 
-from zeeguu.core.model import TopicSubscription, TopicSubscription, Topic
+from zeeguu.core.model import TopicSubscription, NewTopicSubscription, NewTopic
 from tqdm import tqdm
 
 app = create_app()
@@ -46,8 +46,8 @@
     old_topic = topic_sub.topic
     new_topic_id = OLD_TOPIC_TO_NEW_TOPIC_MAP.get(old_topic.id, None)
     if new_topic_id:
-        new_topic = Topic.find_by_id(new_topic_id)
-        new_user_sub = TopicSubscription.find_or_create(db_session, user, new_topic)
+        new_topic = NewTopic.find_by_id(new_topic_id)
+        new_user_sub = NewTopicSubscription.find_or_create(db_session, user, new_topic)
         if VERBOSE:
             print(
                 f"User {user.id}, was subscribed to '{old_topic.title}' and now is subscribed to: '{new_topic.title}'"
diff --git a/tools/es_v8_migration/set_new_topics_from_url_keyword.py b/tools/old/es_v8_migration/set_new_topics_from_url_keyword.py
similarity index 91%
rename from tools/es_v8_migration/set_new_topics_from_url_keyword.py
rename to tools/old/es_v8_migration/set_new_topics_from_url_keyword.py
index 647b882b..f2a385d5 100644
--- a/tools/es_v8_migration/set_new_topics_from_url_keyword.py
+++ b/tools/old/es_v8_migration/set_new_topics_from_url_keyword.py
@@ -11,7
+11,7 @@ Article, ArticleUrlKeywordMap, UrlKeyword, - ArticleTopicMap, + NewArticleTopicMap, ) from tqdm import tqdm @@ -29,9 +29,9 @@ db_session.query(Article.id) .join(ArticleUrlKeywordMap) .join(UrlKeyword) - .join(ArticleTopicMap, isouter=True) - .filter(UrlKeyword.topic != None) - .filter(ArticleTopicMap.topic_id == None) + .join(NewArticleTopicMap, isouter=True) + .filter(UrlKeyword.new_topic != None) + .filter(NewArticleTopicMap.new_topic_id == None) .all() ) print("Adding topics based on url keywords to articles...") diff --git a/tools/es_v8_migration/set_topic_mapping_to_keywords.py b/tools/old/es_v8_migration/set_topic_mapping_to_keywords.py similarity index 88% rename from tools/es_v8_migration/set_topic_mapping_to_keywords.py rename to tools/old/es_v8_migration/set_topic_mapping_to_keywords.py index 7fbddade..e6a2a16e 100644 --- a/tools/es_v8_migration/set_topic_mapping_to_keywords.py +++ b/tools/old/es_v8_migration/set_topic_mapping_to_keywords.py @@ -1,6 +1,6 @@ import pandas as pd from zeeguu.core.model.url_keyword import UrlKeyword -from zeeguu.core.model.topic import Topic +from zeeguu.core.model.new_topic import NewTopic import zeeguu.core from tqdm import tqdm from zeeguu.api.app import create_app @@ -23,7 +23,7 @@ url_k_list = UrlKeyword.find_all_by_keyword(keyword) for url_k in url_k_list: topic_to_assign = ( - Topic.find_by_id(row["val_pred"]) if row["val_pred"] != -1 else None + NewTopic.find_by_id(row["val_pred"]) if row["val_pred"] != -1 else None ) url_k.new_topic = topic_to_assign db_session.add(url_k) diff --git a/tools/es_v8_migration/set_url_keywords_article.py b/tools/old/es_v8_migration/set_url_keywords_article.py similarity index 100% rename from tools/es_v8_migration/set_url_keywords_article.py rename to tools/old/es_v8_migration/set_url_keywords_article.py diff --git a/tools/es_v8_migration/url_topics.py b/tools/old/es_v8_migration/url_topics.py similarity index 100% rename from tools/es_v8_migration/url_topics.py rename to tools/old/es_v8_migration/url_topics.py diff --git a/tools/es_v8_migration/url_topics_count_with_pred_to_db.csv b/tools/old/es_v8_migration/url_topics_count_with_pred_to_db.csv similarity index 100% rename from tools/es_v8_migration/url_topics_count_with_pred_to_db.csv rename to tools/old/es_v8_migration/url_topics_count_with_pred_to_db.csv From cb2f41165d7f3640cfa949f2b477a7049aa1eabb Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Fri, 15 Nov 2024 13:51:05 +0100 Subject: [PATCH 63/71] Fixes to tools --- tools/evaluate_infer_topics.py | 6 ++++-- tools/extract_articles_with_new_topics.py | 12 +++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/evaluate_infer_topics.py b/tools/evaluate_infer_topics.py index 4d655ee8..faf203db 100644 --- a/tools/evaluate_infer_topics.py +++ b/tools/evaluate_infer_topics.py @@ -50,8 +50,10 @@ article_to_search, k_to_use ) - neighbouring_topics = [t.topic for a in a_found_t for t in a.topic] + neighbouring_topics = [t.topic.title for a in a_found_t for t in a.topics] neighbouring_keywords = [t.url_keyword for a in a_found_t for t in a.url_keywords] + if len(hits_t) == 0: + continue avg_score = sum([float(h["_score"]) for h in hits_t]) / len(hits_t) topics_counter = Counter(neighbouring_topics) @@ -68,7 +70,7 @@ threshold = ( sum(topics_counter.values()) // 2 ) # The threshold is being at least half or above rounded down - prediction = str(top_topic.title) if count >= threshold else "" + prediction = str(top_topic) if count >= threshold else "" print(f"Prediction: 
'{prediction}', Original: '{og_topics}'.")
     print(f"Pred Avg Score: {avg_score:.2f}, {len(hits_t)} K neigh.")
     print(f"Progress: {i+1}/{TOTAL_EXAMPLES}")
diff --git a/tools/extract_articles_with_new_topics.py b/tools/extract_articles_with_new_topics.py
index b65cd1fd..4a8fea41 100644
--- a/tools/extract_articles_with_new_topics.py
+++ b/tools/extract_articles_with_new_topics.py
@@ -24,13 +24,11 @@
 articles_to_extract = []
 
 for a in tqdm(articles, total=len(articles)):
-    tuple = (
-        a.id,
-        a.content,
-        a.topic(),
-        a.topics[-1].topic.title,
-    )
-    articles_to_extract.append(tuple)
+    row = [a.id, a.content, len(a.topics)]
+    topics_data = []
+    for atm in a.topics:
+        topics_data += [atm.topic.title, atm.origin_type]
+    articles_to_extract.append(row + topics_data)
 
 with open("data_for_eval_new_topic.json", "w+", encoding="utf-8") as f:
     f.write(json.dumps(articles_to_extract))
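Before the next fixes, it is worth restating the voting rule that evaluate_infer_topics.py applies, since it is easy to lose across the diff fragments: a topic is predicted only when the most common neighbour topic holds at least half of all neighbour votes, rounded down. A self-contained restatement, with hypothetical inputs:

    # Standalone restatement of the kNN majority-vote rule; the example data
    # is hypothetical, the arithmetic matches the tool above.
    from collections import Counter


    def predict_topic(neighbouring_topics):
        topics_counter = Counter(neighbouring_topics)
        if not topics_counter:
            return ""
        top_topic, count = topics_counter.most_common(1)[0]
        # The threshold is half of all neighbour votes, rounded down.
        threshold = sum(topics_counter.values()) // 2
        return str(top_topic) if count >= threshold else ""


    # 5 of 8 neighbours agree -> meets the threshold of 4, prediction sticks:
    assert predict_topic(["Sports"] * 5 + ["Politics"] * 3) == "Sports"
    # a 1-1-1-1-1 split never reaches the threshold of 2 -> no prediction:
    assert predict_topic(
        ["Sports", "Politics", "Business", "Satire", "Travel & Tourism"]
    ) == ""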
From 4a6e10739b7ea407f18699296a218cdfb82090dd Mon Sep 17 00:00:00 2001
From: Tiago Ribeiro
Date: Fri, 15 Nov 2024 14:35:24 +0100
Subject: [PATCH 64/71] Remove __eq__ method

- Not needed

---
 tools/evaluate_infer_topics.py | 4 ++--
 zeeguu/core/model/topic.py     | 7 ++-----
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/tools/evaluate_infer_topics.py b/tools/evaluate_infer_topics.py
index faf203db..c58d1694 100644
--- a/tools/evaluate_infer_topics.py
+++ b/tools/evaluate_infer_topics.py
@@ -50,7 +50,7 @@
         article_to_search, k_to_use
     )
 
-    neighbouring_topics = [t.topic.title for a in a_found_t for t in a.topics]
+    neighbouring_topics = [t.topic for a in a_found_t for t in a.topics]
     neighbouring_keywords = [t.url_keyword for a in a_found_t for t in a.url_keywords]
     if len(hits_t) == 0:
         continue
@@ -70,7 +70,7 @@
     threshold = (
         sum(topics_counter.values()) // 2
     )  # The threshold is being at least half or above rounded down
-    prediction = str(top_topic) if count >= threshold else ""
+    prediction = str(top_topic.title) if count >= threshold else ""
     print(f"Prediction: '{prediction}', Original: '{og_topics}'.")
     print(f"Pred Avg Score: {avg_score:.2f}, {len(hits_t)} K neigh.")
     print(f"Progress: {i+1}/{TOTAL_EXAMPLES}")
diff --git a/zeeguu/core/model/topic.py b/zeeguu/core/model/topic.py
index e6a370ca..838dc6a7 100644
--- a/zeeguu/core/model/topic.py
+++ b/zeeguu/core/model/topic.py
@@ -10,9 +10,9 @@
 class Topic(db.Model):
     """
-    The New Topics are standerdized accross all languages.
+    The Topics are standardized across all languages.
 
-    Each UrlKeyword can be associated with one New Topic
+    Each UrlKeyword can be associated with one Topic
     which are used to infer topics in articles which
     haven't got any topic. This relationship is stored in
     ArticleTopicMap.
@@ -33,9 +33,6 @@ def __init__(self, title):
 
     def __repr__(self):
         return f"<Topic {self.title}>"
 
-    def __eq__(self, other):
-        return self.id == other.id and self.title == other.title
-
     def as_dictionary(self):
 
         return dict(
From a9594e9cccb8e7d57738a90fb70f2ee5f1acb5f0 Mon Sep 17 00:00:00 2001
From: Tiago Ribeiro
Date: Fri, 15 Nov 2024 14:35:42 +0100
Subject: [PATCH 65/71] Fixes to Tools

---
 tools/run_knn_topic_inferance.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/tools/run_knn_topic_inferance.py b/tools/run_knn_topic_inferance.py
index 2098e216..ab230c8c 100644
--- a/tools/run_knn_topic_inferance.py
+++ b/tools/run_knn_topic_inferance.py
@@ -6,6 +6,7 @@
 
 from zeeguu.core.model.article import Article
 from zeeguu.core.model.language import Language
+from zeeguu.core.model.url_keyword import UrlKeyword
 from zeeguu.core.elastic.settings import ES_CONN_STRING, ES_ZINDEX
 from elasticsearch import Elasticsearch
 
@@ -29,9 +30,6 @@ def search_similar_to_article(article_id):
     app = create_app()
     app.app_context().push()
 
-    es = Elasticsearch(ES_CONN_STRING)
-
     doc_to_search = article_id
     article_to_search = Article.find_by_id(doc_to_search)
 
@@ -46,9 +44,8 @@ def search_similar_to_article(article_id):
     for hit in hits:
         print(
             hit["_id"],
-            hit["_source"]["old_topics"],
             hit["_source"]["language"],
-            f"New Topics: {hit['_source']['topics']}",
+            f"Topics: {hit['_source']['topics']}",
             hit["_source"].get("url_keywords", []),
             hit["_source"].get("url", ""),
             hit["_score"],
         )
 
     for hit in hits_t:
         print(
             hit["_id"],
-            hit["_source"]["old_topics"],
             hit["_source"]["language"],
-            f"New Topics: {hit['_source']['topics']}",
+            f"Topics: {hit['_source']['topics']}",
             hit["_source"].get("url_keywords", []),
             hit["_source"].get("url", ""),
             hit["_score"],
         )
     for hit in hits_lt:
         print(
             hit["_id"],
-            hit["_source"]["old_topics"],
             hit["_source"]["language"],
-            f"New Topics: {hit['_source']['topics']}",
+            f"Topics: {hit['_source']['topics']}",
             hit["_source"].get("url_keywords", []),
             hit["_source"].get("url", ""),
             hit["_score"],
         )
-    neighbouring_topics = [t.topic for a in a_found_t for t in a.topics]
-    TOPICS_TO_NOT_COUNT = set(["news", "aktuell", "nyheder", "nieuws", "article"])
+    neighbouring_topics = [t.topic.title for a in a_found_t for t in a.topics]
+    TOPICS_TO_NOT_COUNT = UrlKeyword.EXCLUDE_TOPICS
     neighbouring_keywords = [
-        t.url_keywords
+        t.url_keyword
         for a in a_found_t
         for t in a.url_keywords
-        if t.url_keywords.keyword not in TOPICS_TO_NOT_COUNT
+        if t.url_keyword.keyword not in TOPICS_TO_NOT_COUNT
     ]
 
     print()
From 375e7016bfbf7f2b10d8b6bad0c1dae9f43136a0 Mon Sep 17 00:00:00 2001
From: Tiago Ribeiro
Date: Fri, 15 Nov 2024 14:36:10 +0100
Subject: [PATCH 66/71] Fixes & Renaming

- Update run_knn_topic_inferance.py
- Improved printing in run_knn_topic_inferance.py

---
 tools/run_knn_topic_inferance.py                | 26 +++++++-------------
 .../elastic_recommender.py                      |  6 ++---
 .../content_retriever/article_downloader.py     | 12 +++-------
 zeeguu/core/elastic/elastic_query_builder.py    |  2 +-
 4 files changed, 13 insertions(+), 33 deletions(-)

diff --git a/tools/run_knn_topic_inferance.py b/tools/run_knn_topic_inferance.py
index ab230c8c..2743397d 100644
--- a/tools/run_knn_topic_inferance.py
+++ b/tools/run_knn_topic_inferance.py
@@ -43,35 +43,20 @@ def search_similar_to_article(article_id):
     print("Similar articles:")
     for hit in hits:
         print(
-            hit["_id"],
-            hit["_source"]["language"],
-            f"Topics: {hit['_source']['topics']}",
-            hit["_source"].get("url_keywords", []),
-            hit["_source"].get("url", ""),
-            hit["_score"],
+            f"{hit['_id']} {hit['_score']:.4f} {hit['_source']['language']}, Topics: {hit['_source']['topics']} {hit['_source'].get('url_keywords', [])} {hit['_source'].get('url', '')}"
         )
 
     print()
     print("Similar articles to classify:")
     for hit in hits_t:
         print(
-            hit["_id"],
-            hit["_source"]["language"],
-            f"Topics: {hit['_source']['topics']}",
-            hit["_source"].get("url_keywords", []),
-            hit["_source"].get("url", ""),
-            hit["_score"],
+            f"{hit['_id']} {hit['_score']:.4f} {hit['_source']['language']}, Topics: {hit['_source']['topics']} {hit['_source'].get('url_keywords', [])} {hit['_source'].get('url', '')}"
         )
     print()
     print("More like this articles!:")
     for hit in hits_lt:
         print(
-            hit["_id"],
-            hit["_source"]["language"],
-            f"Topics: {hit['_source']['topics']}",
-            hit["_source"].get("url_keywords", []),
-            hit["_source"].get("url", ""),
-            hit["_score"],
+            f"{hit['_id']} {hit['_score']:.4f} {hit['_source']['language']}, Topics: {hit['_source']['topics']} {hit['_source'].get('url_keywords', [])} {hit['_source'].get('url', '')}"
         )
     neighbouring_topics = [t.topic.title for a in a_found_t for t in a.topics]
     TOPICS_TO_NOT_COUNT = UrlKeyword.EXCLUDE_TOPICS
     neighbouring_keywords = [
         t.url_keyword
         for a in a_found_t
         for t in a.url_keywords
         if t.url_keyword.keyword not in TOPICS_TO_NOT_COUNT
     ]
 
     print()
@@ -91,8 +76,9 @@
     print(topics_key_counter)
     print("Classification: ", topics_key_counter.most_common(1)[0])
     print()
-    print(article_to_search.title[:100])
-    print(article_to_search.content[:100])
+    print("Title: ", article_to_search.title[:100])
+    print("Content: ", article_to_search.content[:100])
+    print()
     print("Top match content (sim): ")
     print(a_found_t[0].content[:100])
     print("Top match content (sim, same lang): ")
diff --git a/zeeguu/core/content_recommender/elastic_recommender.py b/zeeguu/core/content_recommender/elastic_recommender.py
index 3a09404c..69f77ca7 100644
--- a/zeeguu/core/content_recommender/elastic_recommender.py
+++ b/zeeguu/core/content_recommender/elastic_recommender.py
@@ -50,7 +50,7 @@ def _prepare_user_constraints(user):
         unwanted_user_searches.append(user_search_filter.search.keywords)
     print(f"keywords to exclude: {unwanted_user_searches}")
 
-    # 2. New Topics to exclude / filter out
+    # 2. Topics to exclude / filter out
     # =================================
     excluded_topics = TopicFilter.all_for_user(user)
     topics_to_exclude = [
     ]
     print(f"Topics to exclude: {excluded_topics}")
 
-    # 3. New Topics subscribed, and thus to include
+    # 3. Topics subscribed, and thus to include
     # =========================================
     topic_subscriptions = TopicSubscription.all_for_user(user)
     topics_to_include = [
         subscription.topic.title
         for subscription in topic_subscriptions
         if subscription is not None
     ]
-    print(f"New Topics to include: {topic_subscriptions}")
+    print(f"Topics to include: {topic_subscriptions}")
 
     # 6. 
Wanted user topics # ========================================= diff --git a/zeeguu/core/content_retriever/article_downloader.py b/zeeguu/core/content_retriever/article_downloader.py index 0d34db8c..358597fa 100644 --- a/zeeguu/core/content_retriever/article_downloader.py +++ b/zeeguu/core/content_retriever/article_downloader.py @@ -16,12 +16,7 @@ from zeeguu.core import model -SEMANTIC_SEARCH_AVAILABLE = True -try: - from zeeguu.core.semantic_search import add_topics_based_on_semantic_hood_search -except: - SEMANTIC_SEARCH_AVAILABLE = False - print("######### Failed to load semantic search modules") +from zeeguu.core.semantic_search import add_topics_based_on_semantic_hood_search from zeeguu.core.content_quality.quality_filter import sufficient_quality from zeeguu.core.content_cleaning import cleanup_text_w_crawl_report from zeeguu.core.emailer.zeeguu_mailer import ZeeguuMailer @@ -333,9 +328,8 @@ def download_feed_item(session, feed, feed_item, url, crawl_report): url_keywords = add_url_keywords(new_article, session) logp(f"Topic Keywords: ({url_keywords})") - if SEMANTIC_SEARCH_AVAILABLE: - _, topics = add_topics(new_article, feed, url_keywords, session) - logp(f"New Topics ({topics})") + _, topics = add_topics(new_article, feed, url_keywords, session) + logp(f"Topics ({topics})") session.add(new_article) return new_article diff --git a/zeeguu/core/elastic/elastic_query_builder.py b/zeeguu/core/elastic/elastic_query_builder.py index c1afabb9..46e95cd7 100644 --- a/zeeguu/core/elastic/elastic_query_builder.py +++ b/zeeguu/core/elastic/elastic_query_builder.py @@ -38,7 +38,7 @@ def more_like_this_query(count, article_text, language, page=0): .filter("term", language=language.name.lower()) ) - return {"from": page * count, "size": count, "query": s.to_dict()} + return {"from": page * count, "size": count, "query": s.query.to_dict()} def build_elastic_recommender_query( From c03d4d930a9533ca760f72adf341722ea5722fd5 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Mon, 18 Nov 2024 09:52:07 +0100 Subject: [PATCH 67/71] Delete docker-compose-v8.yml - Removed ES8 compose file --- docker-compose-v8.yml | 168 ------------------------------------------ 1 file changed, 168 deletions(-) delete mode 100644 docker-compose-v8.yml diff --git a/docker-compose-v8.yml b/docker-compose-v8.yml deleted file mode 100644 index d9e6a963..00000000 --- a/docker-compose-v8.yml +++ /dev/null @@ -1,168 +0,0 @@ -version: "3.9" - -services: - dev_server: - image: zeeguu_api_dev - environment: - ZEEGUU_CONFIG: /Zeeguu-API/default_docker.cfg - ZEEGUU_ES_CONN_STRING: "http://elasticsearch_v8:9200" - ZEEGUU_EMB_API_CONN_STRING: "http://embedding_api:3654" - PYTHONUNBUFFERED: 1 - MICROSOFT_TRANSLATE_API_KEY: ${MICROSOFT_TRANSLATE_API_KEY} - GOOGLE_TRANSLATE_API_KEY: ${GOOGLE_TRANSLATE_API_KEY} - ports: - - 9001:9001 - volumes: - - .:/Zeeguu-API - - ./data/zeeguu/esV8:/zeeguu-data - entrypoint: "python /Zeeguu-API/start.py" - networks: - - zeeguu_backend - depends_on: - - elasticsearch_v8 - - embedding_api - - readability_server - mem_limit: 2048m - - readability_server: - image: aecrimus/readability_server - ports: - - 3456:3456 - networks: - - zeeguu_backend - restart: unless-stopped - mem_limit: 1024m - - elasticsearch_v8: - image: elasticsearch:8.12.2 - platform: linux/amd64 - ports: - - 9200:9200 - - 9300:9300 - environment: - - discovery.type=single-node - - xpack.security.enabled=false - volumes: - - ./data/elasticsearch_db_v8/data:/usr/share/elasticsearch/data - networks: - - zeeguu_backend - restart: unless-stopped 
- mem_limit: 2048m - - - dev_play: - image: zeeguu_api_dev - environment: - ZEEGUU_CONFIG: /Zeeguu-API/default_docker.cfg - volumes: - - .:/Zeeguu-API - - ./data/zeeguu:/zeeguu-data - entrypoint: "python tools/_playground.py" - depends_on: - - elasticsearch_v8 - - embedding_api - - readability_server - networks: - - zeeguu_backend - - # docker-compose run --rm dev_bash - dev_bash: - image: zeeguu_api_dev - stdin_open: true # docker run -i - tty: true # docker run -t - - environment: - ZEEGUU_CONFIG: /Zeeguu-API/default_docker.cfg - - volumes: - - .:/Zeeguu-API - - ./data/zeeguu:/zeeguu-data - entrypoint: "bash" - networks: - - zeeguu_backend - - dev_test: - image: zeeguu_api_dev - environment: - ZEEGUU_CONFIG: /Zeeguu-API/default_docker.cfg - - volumes: - - .:/Zeeguu-API - - ./data/zeeguu:/zeeguu-data - entrypoint: "./run_tests.sh" - networks: - - zeeguu_backend - - dev_init_es: - image: zeeguu_api_dev - environment: - ZEEGUU_CONFIG: /Zeeguu-API/default_docker.cfg - ZEEGUU_ES_CONN_STRING: "http://elasticsearch_v8:9200" - ZEEGUU_EMB_API_CONN_STRING: "http://embedding_api:3654" - volumes: - - .:/Zeeguu-API - - ./data/zeeguu/esV8:/zeeguu-data - - ./Users:/userslalal - entrypoint: "python tools/mysql_to_elastic_new_topics.py" - networks: - - zeeguu_backend - depends_on: - - elasticsearch_v8 - - embedding_api - - embedding_api: - image: zeeguu_api_sem_emb - environment: - SEMANTIC_EMB_API_PORT: 3654 - ports: - - 3654:3654 - entrypoint: "python ./semantic-emb-api/app/app.py" - volumes: - - .:/Zeeguu-API - - ./data/zeeguu/language-models:/semantic-emb-api/semantic-emb-api/app/semantic_vector/binaries - networks: - - zeeguu_backend - - dev_server_pre: - image: zeeguu_api_dev - environment: - ZEEGUU_CONFIG: /Zeeguu-API/default_docker_v8.cfg - ZEEGUU_ES_CONN_STRING: "http://elasticsearch_v8_pre:9205" - ZEEGUU_EMB_API_CONN_STRING: "http://embedding_api:3654" - PYTHONUNBUFFERED: 1 - MICROSOFT_TRANSLATE_API_KEY: ${MICROSOFT_TRANSLATE_API_KEY} - GOOGLE_TRANSLATE_API_KEY: ${GOOGLE_TRANSLATE_API_KEY} - ports: - - 9005:9005 - volumes: - - .:/Zeeguu-API - - ./data/zeeguu/esV8:/zeeguu-data - entrypoint: "python /Zeeguu-API/start.py" - networks: - - zeeguu_backend - depends_on: - - elasticsearch_v8_pre - - embedding_api - - readability_server - mem_limit: 2048m - - elasticsearch_v8_pre: - image: elasticsearch:8.12.2 - platform: linux/amd64 - ports: - - 9205:9205 - - 9305:9305 - environment: - - discovery.type=single-node - - xpack.security.enabled=false - - http.port=9205 - volumes: - - ./data/elasticsearch_db_v8/data:/usr/share/elasticsearch/data - networks: - - zeeguu_backend - restart: unless-stopped - mem_limit: 2048m - - -networks: - zeeguu_backend: From 316d610a9a3631395e74ab2985d1effa87113430 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Mon, 18 Nov 2024 09:52:22 +0100 Subject: [PATCH 68/71] Update docker-compose.yml - Removed ES7 service, as it is no longer used. --- docker-compose.yml | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 0c01b9a9..dfe2539e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -47,19 +47,6 @@ services: restart: unless-stopped mem_limit: 512m # Useful to restrict the ammount of RAM used by ES. - elasticsearch: - image: elasticsearch:7.6.2 - environment: - - discovery.type=single-node - - network.host=0.0.0.0 # is this still needed? 
- volumes: - - ${ZEEGUU_DATA_FOLDER}/elasticsearch/data:/usr/share/elasticsearch/data - networks: - - zeeguu_backend - restart: unless-stopped - mem_limit: 512m - - readability_server: image: zeeguu/readability_server networks: From f3c24b1c1e4c1ad78c1490a04f955853a76c4879 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Mon, 18 Nov 2024 13:50:03 +0100 Subject: [PATCH 69/71] Update top_bookmarks_for_user.py --- zeeguu/core/bookmark_quality/top_bookmarks_for_user.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zeeguu/core/bookmark_quality/top_bookmarks_for_user.py b/zeeguu/core/bookmark_quality/top_bookmarks_for_user.py index 9be6daf1..e53b1513 100644 --- a/zeeguu/core/bookmark_quality/top_bookmarks_for_user.py +++ b/zeeguu/core/bookmark_quality/top_bookmarks_for_user.py @@ -14,7 +14,7 @@ def rank(b): query.join(UserWord, Bookmark.origin_id == UserWord.id) .filter(UserWord.language_id == self.learned_language_id) .filter(Bookmark.user_id == self.id) - .filter(Bookmark.learned_time != None) + .filter(Bookmark.learned_time == None) .order_by(Bookmark.time.desc()) .limit(400) ) From d42a28d2beb862d5c24da5644253878969218b2d Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Thu, 21 Nov 2024 13:08:18 +0100 Subject: [PATCH 70/71] Fix report generation - Report topic generation method got overridden when changing the topic names. --- tools/report_generator/data_extractor.py | 2 +- tools/report_generator/generate_report.py | 65 +++++++++++------------ 2 files changed, 33 insertions(+), 34 deletions(-) diff --git a/tools/report_generator/data_extractor.py b/tools/report_generator/data_extractor.py index be11e7e4..93a2b3b4 100644 --- a/tools/report_generator/data_extractor.py +++ b/tools/report_generator/data_extractor.py @@ -24,7 +24,7 @@ def run_query(self, query): return df def get_article_topics_df(self, feed_df): - print("Getting Article New Topics...") + print("Getting Article Topics...") query = f"""SELECT a.id, l.name Language, a.feed_id, t.title Topic, atm.origin_type FROM article a INNER JOIN article_topic_map atm on a.id = atm.article_id diff --git a/tools/report_generator/generate_report.py b/tools/report_generator/generate_report.py index f9d6566e..e7aabdf3 100644 --- a/tools/report_generator/generate_report.py +++ b/tools/report_generator/generate_report.py @@ -1,6 +1,6 @@ import pandas as pd import matplotlib.pyplot as plt -from matplotlib import rcParams +from zeeguu.core.model.article_topic_map import TopicOriginType from tools.crawl_summary.crawl_report import CrawlReport import seaborn as sns from data_extractor import DataExtractor @@ -194,51 +194,50 @@ def generate_topic_by_feed_plot(article_topic_df, lang): def generate_topic_coverage_plot(article_df, article_with_topics_df): filename = f"topic_coverage_plot_{date_str}_d{DAYS_FOR_REPORT}.png" - article_df["has_topic"] = "No" - article_df.loc[article_df.id.isin(article_with_topics_df.id), "has_topic"] = "Yes" - articles_with_topics = ( - article_df.groupby("Language") - .has_topic.value_counts(normalize=True) - .reset_index() - ) - sns.barplot( - x="Language", - y="proportion", - hue="has_topic", - data=articles_with_topics, - palette={ - "Yes": sns.color_palette("vlag")[0], - "No": sns.color_palette("vlag")[5], - }, - ) - plt.title("Proportion of Articles with Topics") - plt.xticks(rotation=35, ha="right") - return save_fig_params(filename) - - -def generate_topic_coverage_plot(article_df, article_with_topics_df): - filename = f"topic_coverage_plot_{date_str}_d{DAYS_FOR_REPORT}.png" - 
article_df["has_topic"] = "No"
-    article_df.loc[article_df.id.isin(article_with_topics_df.id), "has_topic"] = "Yes"
+    article_df["Has Topic"] = "No Topic"
+    article_df.loc[
+        article_df.id.isin(
+            article_with_topics_df.loc[
+                article_with_topics_df.origin_type == TopicOriginType.HARDSET, "id"
+            ]
+        ),
+        "Has Topic",
+    ] = "Hardset Topic"
+    article_df.loc[
+        article_df.id.isin(
+            article_with_topics_df.loc[
+                article_with_topics_df.origin_type == TopicOriginType.URL_PARSED, "id"
+            ]
+        ),
+        "Has Topic",
+    ] = "Url Keyword Topic"
+    article_df.loc[
+        article_df.id.isin(
+            article_with_topics_df.loc[
+                article_with_topics_df.origin_type == TopicOriginType.INFERRED, "id"
+            ]
+        ),
+        "Has Topic",
+    ] = "Inferred Topic"
     articles_with_topics = (
         article_df.groupby("Language")["Has Topic"]
         .value_counts(normalize=True)
         .reset_index()
     )
-
+    color_palette = sns.color_palette("vlag", n_colors=15)
     sns.barplot(
         x="Language",
         y="proportion",
         hue="Has Topic",
         data=articles_with_topics,
         palette={
-            "Inferred Topic": sns.color_palette("vlag")[2],
-            "Url Keyword Topic": sns.color_palette("vlag")[0],
-            "Hardset Topic": sns.color_palette("vlag")[1],
-            "No Topic": sns.color_palette("vlag")[5],
+            "Url Keyword Topic": color_palette[0],
+            "Hardset Topic": color_palette[1],
+            "Inferred Topic": color_palette[3],
+            "No Topic": color_palette[len(color_palette) - 1],
         },
     )
-    plt.title("Proportion of Articles with New Topics")
+    plt.title("Proportion of Articles with Topics")
     plt.xticks(rotation=35, ha="right")
     return save_fig_params(filename)
From 01c505fd779930e75d2a901dcb737dbe78503181 Mon Sep 17 00:00:00 2001
From: Mircea Filip Lungu
Date: Thu, 21 Nov 2024 16:29:47 +0100
Subject: [PATCH 71/71] more human readable updates for exercise sessions

---
 requirements.txt                     |  1 +
 zeeguu/core/emailer/user_activity.py | 14 ++++++++++----
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 470129c4..d7445d9c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,6 +7,7 @@ elasticsearch==8.12.1
 elasticsearch-dsl==8.12.0
 Faker
 feedparser
+human-readable
 flask==2.3.2
 werkzeug==3.0.2
 Flask-Assets
diff --git a/zeeguu/core/emailer/user_activity.py b/zeeguu/core/emailer/user_activity.py
index de83bb2c..bf3befea 100644
--- a/zeeguu/core/emailer/user_activity.py
+++ b/zeeguu/core/emailer/user_activity.py
@@ -1,6 +1,8 @@
 from zeeguu.core.model import User
 from zeeguu.core.emailer.zeeguu_mailer import ZeeguuMailer
 from zeeguu.core.model.user_activitiy_data import UserActivityData
+import human_readable
+import datetime
 
 cheers_your_server = "\n\rCheers,\n\rYour Zeeguu Server ;)"
 
@@ -13,19 +15,23 @@ def send_new_user_account_email(username, invite_code="", cohort=""):
 
 
 def send_user_finished_exercise_session(exercise_session):
+    details = exercise_session.exercises_in_session_string()
     user = exercise_session.user
-    main_body = f"User: {user.name} ({user.id}) Duration: {exercise_session.duration / 1000} \n\n"
+    hr_duration = human_readable.precise_delta(
+        datetime.timedelta(seconds=exercise_session.duration / 1000),
+        minimum_unit="microseconds",
+    )
+    main_body = f"User: {user.name} ({user.id}) Duration: {hr_duration} \n\n"
+    main_body += f"<pre>{details}</pre>"
 
     ZeeguuMailer.send_mail(
         f"{exercise_session.user.name}: Finished Exercise Session",
         [main_body, cheers_your_server],
     )
 
 
 def send_notification_article_feedback(
     feedback, user: User, article_title, article_url, article_id
 ):
     from datetime import datetime as dt