diff --git a/Pipfile b/Pipfile
index 19c456f..46c70f7 100644
--- a/Pipfile
+++ b/Pipfile
@@ -21,7 +21,6 @@ schedule = "*"
 feedparser = "*"
 numpy = "*"
 pandas = "*"
-lightfm = "*"
 langchain = "*"
 chromadb = "*"
 langchain-community = "*"
@@ -31,6 +30,7 @@ tiktoken = "*"
 langchain-openai = "*"
 langchain-google-community = "*"
 wikipedia-api = "*"
+plotly = "*"
 
 [dev-packages]
 flake8 = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
index 0ba1424..4073b9a 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "dced8354c32161ea8c87ca34ad9d1c148e9e6383483a94beef2ee1851f65672c"
+            "sha256": "fcbb46438648285b6c3ef92c5dc51e8fa5a0a7e1ce3bf3bf954770c6e84ef93f"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -155,7 +155,7 @@
                 "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f",
                 "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"
             ],
-            "markers": "python_version < '3.12.0'",
+            "markers": "python_full_version < '3.12.0'",
             "version": "==4.0.3"
         },
         "asyncpg": {
@@ -911,14 +911,6 @@
             "markers": "python_version >= '3.7'",
             "version": "==3.1.4"
         },
-        "joblib": {
-            "hashes": [
-                "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6",
-                "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"
-            ],
-            "markers": "python_version >= '3.8'",
-            "version": "==1.4.2"
-        },
         "jsonpatch": {
             "hashes": [
                 "sha256:0ae28c0cd062bbd8b8ecc26d7d164fbbea9652a1a3693f3b956c1eae5145dade",
@@ -949,7 +941,7 @@
                 "sha256:98e79e0b9a60a9c740b44d5b0135c85f649219308f30d373cf5f10d0efe18b87"
             ],
             "index": "pypi",
-            "markers": "python_full_version >= '3.8.1' and python_version < '4.0'",
+            "markers": "python_version < '4.0' and python_full_version >= '3.8.1'",
             "version": "==0.2.7"
         },
         "langchain-community": {
@@ -958,7 +950,7 @@
                 "sha256:f52659dbb8a8f0c011cc7d634247686d11d768843cc3e5fe9e6f52321cde82c0"
             ],
             "index": "pypi",
-            "markers": "python_full_version >= '3.8.1' and python_version < '4.0'",
+            "markers": "python_version < '4.0' and python_full_version >= '3.8.1'",
             "version": "==0.2.7"
         },
         "langchain-core": {
@@ -966,7 +958,7 @@
                 "sha256:c9dbb197508e76337ed810ec977d40ae0c896397d191b420ef126c3818a1be96",
                 "sha256:ca5c5f1a783449dae8686e366ff3c5b775f8b5cef0de4ef346b8820d3d1c46ff"
             ],
-            "markers": "python_full_version >= '3.8.1' and python_version < '4.0'",
+            "markers": "python_version < '4.0' and python_full_version >= '3.8.1'",
             "version": "==0.2.18"
         },
         "langchain-google-community": {
@@ -975,7 +967,7 @@
                 "sha256:5c2f18aebfb60b51f68dc1608d09ac3e8231743d75c545f409230f7bee641faf"
             ],
             "index": "pypi",
-            "markers": "python_full_version >= '3.8.1' and python_version < '4.0'",
+            "markers": "python_version < '4.0' and python_full_version >= '3.8.1'",
             "version": "==1.0.6"
         },
         "langchain-openai": {
@@ -984,7 +976,7 @@
                 "sha256:bff90e9d0be786495920a7851ae4d55247fb084d3a11d1b15bfe91904ce1cb0f"
             ],
             "index": "pypi",
-            "markers": "python_full_version >= '3.8.1' and python_version < '4.0'",
+            "markers": "python_version < '4.0' and python_full_version >= '3.8.1'",
             "version": "==0.1.16"
         },
         "langchain-text-splitters": {
@@ -992,7 +984,7 @@
                 "sha256:1c80d4b11b55e2995f02d2a326c0323ee1eeff24507329bb22924e420c782dff",
                 "sha256:a1e45de10919fa6fb080ef0525deab56557e9552083600455cb9fa4238076140"
             ],
-            "markers": "python_full_version >= '3.8.1' and python_version < '4.0'",
+            "markers": "python_version < '4.0' and python_full_version >= '3.8.1'",
             "version": "==0.2.2"
         },
         "langsmith": {
@@ -1000,16 +992,9 @@
                 "sha256:acff31f9e53efa48586cf8e32f65625a335c74d7c4fa306d1655ac18452296f6",
                 "sha256:c1f94384f10cea96f7b4d33fd3db7ec180c03c7468877d50846f881d2017ff94"
             ],
-            "markers": "python_full_version >= '3.8.1' and python_version < '4.0'",
+            "markers": "python_version < '4.0' and python_full_version >= '3.8.1'",
             "version": "==0.1.85"
         },
-        "lightfm": {
-            "hashes": [
-                "sha256:2b77ada182ccd768a8d7643ab3cfcd8b6e855db09087f7cc7329bd63316697a8"
-            ],
-            "index": "pypi",
-            "version": "==1.17"
-        },
         "loguru": {
             "hashes": [
                 "sha256:003d71e3d3ed35f0f8984898359d65b79e5b21943f78af86aa5491210429b8eb",
@@ -1591,6 +1576,15 @@
             "markers": "python_version >= '3.9'",
             "version": "==2.2.2"
         },
+        "plotly": {
+            "hashes": [
+                "sha256:68fc1901f098daeb233cc3dd44ec9dc31fb3ca4f4e53189344199c43496ed006",
+                "sha256:859fdadbd86b5770ae2466e542b761b247d1c6b49daed765b95bb8c7063e7469"
+            ],
+            "index": "pypi",
+            "markers": "python_version >= '3.8'",
+            "version": "==5.22.0"
+        },
         "posthog": {
             "hashes": [
                 "sha256:3c672be7ba6f95d555ea207d4486c171d06657eb34b3ce25eb043bfe7b6b5b76",
@@ -1755,7 +1749,7 @@
                 "sha256:a1bac0ce561155ecc3ed78ca94d3c9378656ad4c94c1270de543f621420f94ad",
                 "sha256:f9db75911801ed778fe61bb643079ff86601aca99fcae6345aa67292038fb742"
             ],
-            "markers": "python_version > '3.0'",
+            "markers": "python_version >= '3.1'",
             "version": "==3.1.2"
         },
         "pypika": {
@@ -1777,7 +1771,7 @@
                 "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3",
                 "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"
             ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==2.9.0.post0"
         },
         "python-dotenv": {
@@ -1986,64 +1980,6 @@
             "markers": "python_version >= '3.7'",
             "version": "==1.2.2"
         },
-        "scikit-learn": {
-            "hashes": [
-                "sha256:0828673c5b520e879f2af6a9e99eee0eefea69a2188be1ca68a6121b809055c1",
-                "sha256:0ea5d40c0e3951df445721927448755d3fe1d80833b0b7308ebff5d2a45e6414",
-                "sha256:10e49170691514a94bb2e03787aa921b82dbc507a4ea1f20fd95557862c98dc1",
-                "sha256:154297ee43c0b83af12464adeab378dee2d0a700ccd03979e2b821e7dd7cc1c2",
-                "sha256:161808750c267b77b4a9603cf9c93579c7a74ba8486b1336034c2f1579546d21",
-                "sha256:1bd8d3a19d4bd6dc5a7d4f358c8c3a60934dc058f363c34c0ac1e9e12a31421d",
-                "sha256:1ff4ba34c2abff5ec59c803ed1d97d61b036f659a17f55be102679e88f926fac",
-                "sha256:508907e5f81390e16d754e8815f7497e52139162fd69c4fdbd2dfa5d6cc88915",
-                "sha256:5944ce1faada31c55fb2ba20a5346b88e36811aab504ccafb9f0339e9f780395",
-                "sha256:5f57428de0c900a98389c4a433d4a3cf89de979b3aa24d1c1d251802aa15e44d",
-                "sha256:689b6f74b2c880276e365fe84fe4f1befd6a774f016339c65655eaff12e10cbf",
-                "sha256:781586c414f8cc58e71da4f3d7af311e0505a683e112f2f62919e3019abd3745",
-                "sha256:7b073a27797a283187a4ef4ee149959defc350b46cbf63a84d8514fe16b69855",
-                "sha256:88e0672c7ac21eb149d409c74cc29f1d611d5158175846e7a9c2427bd12b3956",
-                "sha256:909144d50f367a513cee6090873ae582dba019cb3fca063b38054fa42704c3a4",
-                "sha256:97625f217c5c0c5d0505fa2af28ae424bd37949bb2f16ace3ff5f2f81fb4498b",
-                "sha256:9a07f90846313a7639af6a019d849ff72baadfa4c74c778821ae0fad07b7275b",
-                "sha256:b59e3e62d2be870e5c74af4e793293753565c7383ae82943b83383fdcf5cc5c1",
-                "sha256:b5e865e9bd59396220de49cb4a57b17016256637c61b4c5cc81aaf16bc123bbe",
-                "sha256:da3f404e9e284d2b0a157e1b56b6566a34eb2798205cba35a211df3296ab7a74",
-                "sha256:f5b213bc29cc30a89a3130393b0e39c847a15d769d6e59539cd86b75d276b1a7"
-            ],
-            "markers": "python_version >= '3.9'",
-            "version": "==1.5.1"
-        },
-        "scipy": {
-            "hashes": [
-                "sha256:076c27284c768b84a45dcf2e914d4000aac537da74236a0d45d82c6fa4b7b3c0",
-                "sha256:07e179dc0205a50721022344fb85074f772eadbda1e1b3eecdc483f8033709b7",
-                "sha256:176c6f0d0470a32f1b2efaf40c3d37a24876cebf447498a4cefb947a79c21e9d",
-                "sha256:42470ea0195336df319741e230626b6225a740fd9dce9642ca13e98f667047c0",
-                "sha256:4c4161597c75043f7154238ef419c29a64ac4a7c889d588ea77690ac4d0d9b20",
-                "sha256:5b083c8940028bb7e0b4172acafda6df762da1927b9091f9611b0bcd8676f2bc",
-                "sha256:64b2ff514a98cf2bb734a9f90d32dc89dc6ad4a4a36a312cd0d6327170339eb0",
-                "sha256:65df4da3c12a2bb9ad52b86b4dcf46813e869afb006e58be0f516bc370165159",
-                "sha256:687af0a35462402dd851726295c1a5ae5f987bd6e9026f52e9505994e2f84ef6",
-                "sha256:6a9c9a9b226d9a21e0a208bdb024c3982932e43811b62d202aaf1bb59af264b1",
-                "sha256:6d056a8709ccda6cf36cdd2eac597d13bc03dba38360f418560a93050c76a16e",
-                "sha256:7d3da42fbbbb860211a811782504f38ae7aaec9de8764a9bef6b262de7a2b50f",
-                "sha256:7e911933d54ead4d557c02402710c2396529540b81dd554fc1ba270eb7308484",
-                "sha256:94c164a9e2498e68308e6e148646e486d979f7fcdb8b4cf34b5441894bdb9caf",
-                "sha256:9e3154691b9f7ed73778d746da2df67a19d046a6c8087c8b385bc4cdb2cfca74",
-                "sha256:9eee2989868e274aae26125345584254d97c56194c072ed96cb433f32f692ed8",
-                "sha256:a01cc03bcdc777c9da3cfdcc74b5a75caffb48a6c39c8450a9a05f82c4250a14",
-                "sha256:a7d46c3e0aea5c064e734c3eac5cf9eb1f8c4ceee756262f2c7327c4c2691c86",
-                "sha256:ad36af9626d27a4326c8e884917b7ec321d8a1841cd6dacc67d2a9e90c2f0359",
-                "sha256:b5923f48cb840380f9854339176ef21763118a7300a88203ccd0bdd26e58527b",
-                "sha256:bbc0471b5f22c11c389075d091d3885693fd3f5e9a54ce051b46308bc787e5d4",
-                "sha256:bff2438ea1330e06e53c424893ec0072640dac00f29c6a43a575cbae4c99b2b9",
-                "sha256:c40003d880f39c11c1edbae8144e3813904b10514cd3d3d00c277ae996488cdb",
-                "sha256:d91db2c41dd6c20646af280355d41dfa1ec7eead235642178bd57635a3f82209",
-                "sha256:f0a50da861a7ec4573b7c716b2ebdcdf142b66b756a0d392c236ae568b3a93fb"
-            ],
-            "markers": "python_version >= '3.10'",
-            "version": "==1.14.0"
-        },
         "setuptools": {
             "hashes": [
                 "sha256:f171bab1dfbc86b132997f26a119f6056a57950d058587841a0082e8830f9dc5",
@@ -2071,7 +2007,7 @@
                 "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
                 "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
             ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==1.16.0"
         },
         "sniffio": {
@@ -2170,14 +2106,6 @@
             "markers": "python_version >= '3.8'",
             "version": "==8.3.0"
         },
-        "threadpoolctl": {
-            "hashes": [
-                "sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107",
-                "sha256:56c1e26c150397e58c4926da8eeee87533b1e32bef131bd4bf6a2f45f3185467"
-            ],
-            "markers": "python_version >= '3.8'",
-            "version": "==3.5.0"
-        },
         "tiktoken": {
             "hashes": [
                 "sha256:03c6c40ff1db0f48a7b4d2dafeae73a5607aacb472fa11f125e7baf9dce73704",
diff --git a/app/model/article_model.py b/app/model/article_model.py
index bc1cfbd..2a09594 100644
--- a/app/model/article_model.py
+++ b/app/model/article_model.py
@@ -4,3 +4,5 @@ class ArticleResponse(BaseModel):
     title: str
     content: str
+    pub_date: str
+    image_url: str
diff --git a/app/model/crawled_article.py b/app/model/crawled_article.py
index 2f42f11..d174537 100644
--- a/app/model/crawled_article.py
+++ b/app/model/crawled_article.py
@@ -22,6 +22,8 @@ class Articles(Base):
     phrase = Column(JSONB, nullable=True)
     comment = Column(Text, nullable=True)
     category = Column(CHAR(255), nullable=True)
+    published_at = Column(DateTime, nullable=True)
+    image_url = Column(String, nullable=True)
     probability_issue_finder = Column(Integer, nullable=True)
     probability_lifestyle_consumer = Column(Integer, nullable=True)
     probability_entertainer = Column(Integer, nullable=True)
diff --git a/app/model/user_type.py b/app/model/user_type.py
index bdd15af..3efc2b9 100644
--- a/app/model/user_type.py
+++ b/app/model/user_type.py
@@ -7,6 +7,7 @@ class UserType(Base):
 
     __tablename__ = "user_type"
+    __table_args__ = {"schema": "gyeongdan"}
 
     user_id = Column(BigInteger, primary_key=True, index=True)
     user_type_issue_finder = Column(Integer, nullable=True)
@@ -17,9 +18,9 @@
 
 
 class UserTypes(Enum):
-    NONE: -1
-    ISSUE_FINDER: 0
-    LIFESTYLE_CONSUMER: 1
-    ENTERTAINER: 2
-    TECH_SEPCIALIST: 3
-    PROFESSIONALS: 4
+    NONE = -1
+    ISSUE_FINDER = 0
+    LIFESTYLE_CONSUMER = 1
+    ENTERTAINER = 2
+    TECH_SEPCIALIST = 3
+    PROFESSIONALS = 4
diff --git a/app/recommend/recommend_service.py b/app/recommend/recommend_service.py
index d0f9a9f..d2b391e 100644
--- a/app/recommend/recommend_service.py
+++ b/app/recommend/recommend_service.py
@@ -1,243 +1,243 @@
-# pylint: disable=missing-module-docstring, missing-module-docstring, attribute-defined-outside-init, unnecessary-comprehension, not-callable, consider-using-f-string, unused-variable
-
-import asyncio
-import os
-import warnings
-from datetime import datetime
-
-import numpy as np
-import pandas as pd
-from fastapi import Depends
-from sqlalchemy.ext.asyncio import AsyncSession
-
-from app.database.repository import model_to_dict
-from app.database.session import get_db_session
-from app.model.crawled_article import Articles
-from app.repository.crawled_article_crud import CrawledArticleRepository
-from app.service.article_manage_service import ArticleManageService
-from app.repository.interaction_crud import InteractionRepository
-from app.model.interaction import Interaction
-from lightfm import LightFM
-from lightfm.data import Dataset  # pylint: disable=E0611
-
-warnings.filterwarnings("ignore")
-
-
-def articles_to_dataframe(articles: list[Articles]) -> pd.DataFrame:
-    # Convert the list of ORM objects to a list of dicts
-    articles_dict_list = [
-        {
-            "article_id": article.id,
-            'ECONOMY_AND_BUSINESS': 0,
-            'POLITICS_AND_SOCIETY': 0,
-            'SPORTS_AND_LEISURE': 0,
-            'TECHNOLOGY_AND_CULTURE': 0
-            # "created_at": article.created_at.strftime('%Y-%m-%d'),
-        }
-        for article in articles
-    ]
-    for i in range(len(articles_dict_list)):
-        articles_dict_list[i][articles[i].category] = 1
-
-    df = pd.DataFrame(articles_dict_list)
-    return df
-
-def interaction_to_dataframe(interactions : list[Interaction]) -> pd.DataFrame:
-    interaction_dict_list = [
-        {
-            "classification_id": interaction.classification_id,
-            "article_id": interaction.article_id,
-            "duration_time": interaction.duration_time
-        }
-        for interaction in interactions
-    ]
-    df = pd.DataFrame(interaction_dict_list)
-    return df
-
-class ArticleDataInfo:
-    def __init__(self, article_id, category, created_at):
-        self.article_data = pd.DataFrame(
-            {
-                "article_id": article_id,
-                "경제 및 기업": [0],
-                "정치 및 사회": [0],
-                "기술 및 문화": [0],
-                "스포츠 및 여가": [0],
-                "오피니언 및 분석": [0],
-                # "created_at": [created_at],
-            }
-        )
-
-        self.article_data.iloc[0][category] = 1
-
-
-class InteractionDataInfo:
-    def __init__(self, user_id, article_id, duration_time):
-        self.interaction_data = pd.DataFrame(
-            {
-                "classification_id": [user_id],
-                "article_id": [article_id],
-                "duration_time": [duration_time],
-            }
-        )
-
-
-class RecommendService:
-    # pylint: disable=too-many-instance-attributes
-
-    def __init__(self):
-        self.interaction_datas = None
-        self.num_classification = 5
-
-    def set_user_datas(self, user_data_path):
-        self.user_data_path = user_data_path
-        self.user_datas = pd.read_csv(os.path.dirname(os.path.abspath(__file__)) + user_data_path)
-
-
-    async def initialize_data(self, session):
-        self.set_user_datas("/./user_classification.csv")
-        await self.set_article_datas(session)
-        await self.set_interaction_datas(session)
-
-    async def set_article_datas(self, session):
-        # session = Depends(get_db_session)
-        articles = await ArticleManageService().get_all_articles(session=session)
-        self.article_datas = pd.get_dummies(articles_to_dataframe(articles))
-
-    async def set_interaction_datas(self, session):
-        # session = Depends(get_db_session)
-        interactions = await InteractionRepository().get_all(session=session)
-        self.interaction_datas = interaction_to_dataframe(interactions)
-        print(self.interaction_datas.columns)
-
-    def make_dataset(self):
-        self.user_datas = pd.get_dummies(self.user_datas)
-        self.user_features_col = self.user_datas.drop(
-            columns=["classification_id"]
-        ).columns.values
-        self.user_feat = self.user_datas.drop(columns=["classification_id"]).to_dict(
-            orient="records"
-        )
-
-        self.item_features = self.article_datas
-        self.item_features_col = self.item_features.drop(
-            columns=["article_id"]
-        ).columns.values
-        self.item_feat = self.item_features.drop(
-            columns=["article_id"]
-        ).to_dict(orient="records")
-
-        self.dataset = Dataset()
-        self.dataset.fit(
-            users=[x for x in self.user_datas["classification_id"]],
-            items=[x for x in self.article_datas["article_id"]],
-            item_features=self.item_features_col,
-            user_features=self.user_features_col,
-        )
-
-        print(self.item_feat)
-        self.item_features = self.dataset.build_item_features(
-            (x, y) for x, y in zip(self.item_features["article_id"], self.item_feat)
-        )
-        self.user_features = self.dataset.build_user_features(
-            (x, y) for x, y in zip(self.user_datas["classification_id"], self.user_feat)
-        )
-
-        (self.interactions, self.weights) = self.dataset.build_interactions(
-            (x, y, z)
-            for x, y, z in zip(
-                self.interaction_datas["classification_id"],
-                self.interaction_datas["article_id"],
-                self.interaction_datas["duration_time"],
-            )
-        )
-
-        num_users, num_items = self.dataset.interactions_shape()
-        print("Num users: {}, num_items {}.".format(num_users, num_items))
-
-    def make_model(
-        self,
-        n_components: int = 30,
-        loss: str = "warp",
-        epoch: int = 30,
-        num_thread: int = 4,
-    ):
-        self.n_components = n_components
-        self.loss = loss
-        self.epoch = epoch
-        self.num_thread = num_thread
-        self.model = LightFM(
-            no_components=self.n_components, loss=self.loss, random_state=1616
-        )
-
-    def fit_model(self):
-        self.make_dataset()
-        self.make_model()
-        self.model.fit(
-            self.interactions,
-            user_features=self.user_features,
-            item_features=self.item_features,
-            epochs=self.epoch,
-            num_threads=self.num_thread,
-            sample_weight=self.weights,
-        )
-
-    def get_top_n_articles(self, user_id: int, article_num: int):
-        item_ids = np.arange(self.interactions.shape[1])  # array of item IDs to predict
-
-        predictions = self.model.predict(user_id, item_ids)
-        top_items = self.article_datas.iloc[np.argsort(-predictions)[:article_num]]
-        return top_items
-
-    def similar_items(self, item_id, N=10):
-        item_bias, item_representations = self.model.get_item_representations(
-            features=self.item_features
-        )
-
-        scores = item_representations.dot(item_representations[item_id, :])
-        best = np.argpartition(scores, -N)[-N:]
-
-        return self.article_datas.iloc[best]
-
-    async def get_classification_for_article(self, article_id:id, session:AsyncSession):
-        scores = self.model.predict(np.arange(len(self.user_datas)), np.full(len(self.user_datas), article_id))
-        top_users = np.argsort(-scores)
-
-        score_for_classification = [0 for _ in range(self.num_classification)]
-        weight = 10
-        for user_id in top_users[:10]:
-            for i in range(self.num_classification):
-                score_for_classification[i] += self.user_datas.iloc[user_id][self.user_datas.columns[i+2]] * (2 ** weight)
-            weight -= 1
-
-        total = sum(score_for_classification)
-        for i in range(self.num_classification):
-            score_for_classification[i] = (int)(score_for_classification[i] / (total/100))
-
-        await CrawledArticleRepository().set_interest_type(article_id, score_for_classification, session)
-
-        return score_for_classification
-
-    def get_time_weight(self, article_id):
-        today = datetime.now().date()
-        date_obj = datetime.strptime(
-            self.article_datas[self.article_datas["article_id"] == article_id][
-                "created_at"
-            ].iloc[0],
-            "%Y-%m-%d",
-        ).date()
-        difference = today - date_obj
-        return max(1 - ((difference.days // 30) / 5), 0)
-
-    def fit_model_partialy(self):
-        self.make_dataset()
-        self.model.fit_partial(self.interactions, item_features=self.item_features)
-
-    def add_interaction_data(self, interaction_data: InteractionDataInfo):
-        InteractionRepository().create(
-            Interaction(
-                classification_id=interaction_data.interaction_data['classification_id'],
-                article_id=interaction_data.interaction_data['article_id'],
-                duration_time=interaction_data.interaction_data['duration_time']
-            )
-        )
\ No newline at end of file
+# # pylint: disable=missing-module-docstring, missing-module-docstring, attribute-defined-outside-init, unnecessary-comprehension, not-callable, consider-using-f-string, unused-variable
+#
+# import asyncio
+# import os
+# import warnings
+# from datetime import datetime
+#
+# import numpy as np
+# import pandas as pd
+# from fastapi import Depends
+# from sqlalchemy.ext.asyncio import AsyncSession
+#
+# from app.database.repository import model_to_dict
+# from app.database.session import get_db_session
+# from app.model.crawled_article import Articles
+# from app.repository.crawled_article_crud import CrawledArticleRepository
+# from app.service.article_manage_service import ArticleManageService
+# from app.repository.interaction_crud import InteractionRepository
+# from app.model.interaction import Interaction
+# from lightfm import LightFM
+# from lightfm.data import Dataset  # pylint: disable=E0611
+#
+# warnings.filterwarnings("ignore")
+#
+#
+# def articles_to_dataframe(articles: list[Articles]) -> pd.DataFrame:
+#     # Convert the list of ORM objects to a list of dicts
+#     articles_dict_list = [
+#         {
+#             "article_id": article.id,
+#             'ECONOMY_AND_BUSINESS': 0,
+#             'POLITICS_AND_SOCIETY': 0,
+#             'SPORTS_AND_LEISURE': 0,
+#             'TECHNOLOGY_AND_CULTURE': 0
+#             # "created_at": article.created_at.strftime('%Y-%m-%d'),
+#         }
+#         for article in articles
+#     ]
+#     for i in range(len(articles_dict_list)):
+#         articles_dict_list[i][articles[i].category] = 1
+#
+#     df = pd.DataFrame(articles_dict_list)
+#     return df
+#
+# def interaction_to_dataframe(interactions : list[Interaction]) -> pd.DataFrame:
+#     interaction_dict_list = [
+#         {
+#             "classification_id": interaction.classification_id,
+#             "article_id": interaction.article_id,
+#             "duration_time": interaction.duration_time
+#         }
+#         for interaction in interactions
+#     ]
+#     df = pd.DataFrame(interaction_dict_list)
+#     return df
+#
+# class ArticleDataInfo:
+#     def __init__(self, article_id, category, created_at):
+#         self.article_data = pd.DataFrame(
+#             {
+#                 "article_id": article_id,
+#                 "경제 및 기업": [0],
+#                 "정치 및 사회": [0],
+#                 "기술 및 문화": [0],
+#                 "스포츠 및 여가": [0],
+#                 "오피니언 및 분석": [0],
+#                 # "created_at": [created_at],
+#             }
+#         )
+#
+#         self.article_data.iloc[0][category] = 1
+#
+#
+# class InteractionDataInfo:
+#     def __init__(self, user_id, article_id, duration_time):
+#         self.interaction_data = pd.DataFrame(
+#             {
+#                 "classification_id": [user_id],
+#                 "article_id": [article_id],
+#                 "duration_time": [duration_time],
+#             }
+#         )
+#
+#
+# class RecommendService:
+#     # pylint: disable=too-many-instance-attributes
+#
+#     def __init__(self):
+#         self.interaction_datas = None
+#         self.num_classification = 5
+#
+#     def set_user_datas(self, user_data_path):
+#         self.user_data_path = user_data_path
+#         self.user_datas = pd.read_csv(os.path.dirname(os.path.abspath(__file__)) + user_data_path)
+#
+#
+#     async def initialize_data(self, session):
+#         self.set_user_datas("/./user_classification.csv")
+#         await self.set_article_datas(session)
+#         await self.set_interaction_datas(session)
+#
+#     async def set_article_datas(self, session):
+#         # session = Depends(get_db_session)
+#         articles = await ArticleManageService().get_all_articles(session=session)
+#         self.article_datas = pd.get_dummies(articles_to_dataframe(articles))
+#
+#     async def set_interaction_datas(self, session):
+#         # session = Depends(get_db_session)
+#         interactions = await InteractionRepository().get_all(session=session)
+#         self.interaction_datas = interaction_to_dataframe(interactions)
+#         print(self.interaction_datas.columns)
+#
+#     def make_dataset(self):
+#         self.user_datas = pd.get_dummies(self.user_datas)
+#         self.user_features_col = self.user_datas.drop(
+#             columns=["classification_id"]
+#         ).columns.values
+#         self.user_feat = self.user_datas.drop(columns=["classification_id"]).to_dict(
+#             orient="records"
+#         )
+#
+#         self.item_features = self.article_datas
+#         self.item_features_col = self.item_features.drop(
+#             columns=["article_id"]
+#         ).columns.values
+#         self.item_feat = self.item_features.drop(
+#             columns=["article_id"]
+#         ).to_dict(orient="records")
+#
+#         self.dataset = Dataset()
+#         self.dataset.fit(
+#             users=[x for x in self.user_datas["classification_id"]],
+#             items=[x for x in self.article_datas["article_id"]],
+#             item_features=self.item_features_col,
+#             user_features=self.user_features_col,
+#         )
+#
+#         print(self.item_feat)
+#         self.item_features = self.dataset.build_item_features(
+#             (x, y) for x, y in zip(self.item_features["article_id"], self.item_feat)
+#         )
+#         self.user_features = self.dataset.build_user_features(
+#             (x, y) for x, y in zip(self.user_datas["classification_id"], self.user_feat)
+#         )
+#
+#         (self.interactions, self.weights) = self.dataset.build_interactions(
+#             (x, y, z)
+#             for x, y, z in zip(
+#                 self.interaction_datas["classification_id"],
+#                 self.interaction_datas["article_id"],
+#                 self.interaction_datas["duration_time"],
+#             )
+#         )
+#
+#         num_users, num_items = self.dataset.interactions_shape()
+#         print("Num users: {}, num_items {}.".format(num_users, num_items))
+#
+#     def make_model(
+#         self,
+#         n_components: int = 30,
+#         loss: str = "warp",
+#         epoch: int = 30,
+#         num_thread: int = 4,
+#     ):
+#         self.n_components = n_components
+#         self.loss = loss
+#         self.epoch = epoch
+#         self.num_thread = num_thread
+#         self.model = LightFM(
+#             no_components=self.n_components, loss=self.loss, random_state=1616
+#         )
+#
+#     def fit_model(self):
+#         self.make_dataset()
+#         self.make_model()
+#         self.model.fit(
+#             self.interactions,
+#             user_features=self.user_features,
+#             item_features=self.item_features,
+#             epochs=self.epoch,
+#             num_threads=self.num_thread,
+#             sample_weight=self.weights,
+#         )
+#
+#     def get_top_n_articles(self, user_id: int, article_num: int):
+#         item_ids = np.arange(self.interactions.shape[1])  # array of item IDs to predict
+#
+#         predictions = self.model.predict(user_id, item_ids)
+#         top_items = self.article_datas.iloc[np.argsort(-predictions)[:article_num]]
+#         return top_items
+#
+#     def similar_items(self, item_id, N=10):
+#         item_bias, item_representations = self.model.get_item_representations(
+#             features=self.item_features
+#         )
+#
+#         scores = item_representations.dot(item_representations[item_id, :])
+#         best = np.argpartition(scores, -N)[-N:]
+#
+#         return self.article_datas.iloc[best]
+#
+#     async def get_classification_for_article(self, article_id:id, session:AsyncSession):
+#         scores = self.model.predict(np.arange(len(self.user_datas)), np.full(len(self.user_datas), article_id))
+#         top_users = np.argsort(-scores)
+#
+#         score_for_classification = [0 for _ in range(self.num_classification)]
+#         weight = 10
+#         for user_id in top_users[:10]:
+#             for i in range(self.num_classification):
+#                 score_for_classification[i] += self.user_datas.iloc[user_id][self.user_datas.columns[i+2]] * (2 ** weight)
+#             weight -= 1
+#
+#         total = sum(score_for_classification)
+#         for i in range(self.num_classification):
+#             score_for_classification[i] = (int)(score_for_classification[i] / (total/100))
+#
+#         await CrawledArticleRepository().set_interest_type(article_id, score_for_classification, session)
+#
+#         return score_for_classification
+#
+#     def get_time_weight(self, article_id):
+#         today = datetime.now().date()
+#         date_obj = datetime.strptime(
+#             self.article_datas[self.article_datas["article_id"] == article_id][
+#                 "created_at"
+#             ].iloc[0],
+#             "%Y-%m-%d",
+#         ).date()
+#         difference = today - date_obj
+#         return max(1 - ((difference.days // 30) / 5), 0)
+#
+#     def fit_model_partialy(self):
+#         self.make_dataset()
+#         self.model.fit_partial(self.interactions, item_features=self.item_features)
+#
+#     def add_interaction_data(self, interaction_data: InteractionDataInfo):
+#         InteractionRepository().create(
+#             Interaction(
+#                 classification_id=interaction_data.interaction_data['classification_id'],
+#                 article_id=interaction_data.interaction_data['article_id'],
+#                 duration_time=interaction_data.interaction_data['duration_time']
+#             )
+#         )
diff --git a/app/service/article_manage_service.py b/app/service/article_manage_service.py
index 65deaab..c039497 100644
--- a/app/service/article_manage_service.py
+++ b/app/service/article_manage_service.py
@@ -1,3 +1,4 @@
+from datetime import datetime
 from typing import List
 from sqlalchemy.ext.asyncio import AsyncSession
 from app.model.article_publisher import Publisher
@@ -16,6 +17,8 @@ async def create_article(
         simple_content: str,
         phrase: dict,
         comment: str,
+        image_url: str,
+        published_at: str,
         category: MailTypeCategory,
         session: AsyncSession,
     ) -> Articles:
@@ -28,6 +31,8 @@ async def create_article(
             simple_title=simple_title,
             simple_content=simple_content,
             comment=comment,
+            published_at=datetime.strptime(published_at, '%Y-%m-%dT%H:%M:%S'),
+            image_url=image_url,
             category=category.name,
             phrase=phrase,
             probability_issue_finder=-1
diff --git a/app/service/crawl_article_service.py b/app/service/crawl_article_service.py
index 462f3ff..bed4d49 100644
--- a/app/service/crawl_article_service.py
+++ b/app/service/crawl_article_service.py
@@ -1,5 +1,7 @@
 import json
 import ssl
+from datetime import datetime
+from typing import List
 
 import aiohttp
 from aiohttp import ClientSession
@@ -11,6 +13,13 @@
 
 
 class CrawlArticleService:
+    def __init__(self):
+        self.__find_image_dict = {
+            Publisher.HAN_KYUNG.name: self.__find_image_han_kyung,
+            Publisher.MAE_KYUNG.name: self.__find_image_mae_kyung,
+            Publisher.SEOUL_KYUNG.name: self.__find_image_seoul_kyung
+        }
+
     async def crawl_article(self, news_type: str, url: str) -> ArticleResponse:
         news_type = find_publisher(news_type)
 
@@ -25,6 +34,8 @@ async def crawl_article(self, news_type: str, url: str) -> ArticleResponse:
         result_html = BeautifulSoup(response_text, "html.parser")
 
         title = self.__find_title(result_html, news_type)
+        pub_date = self.__find_pub_date(result_html, news_type)
+        image_url = self.__find_image(result_html, news_type)
         if news_type == Publisher.SEOUL_KYUNG:
             content = self.__find_content_from_script(result_html)
         else:
@@ -34,7 +45,7 @@ async def crawl_article(self, news_type: str, url: str) -> ArticleResponse:
         if not content.strip():
             raise HTTPException(status_code=404, detail="파싱 결과가 없습니다.")
 
-        return ArticleResponse(title=title, content=content)
+        return ArticleResponse(title=title, content=content, pub_date=pub_date, image_url=image_url)
 
     async def __fetch_page(self, url: str) -> str:
         ssl_context = ssl.create_default_context()
@@ -44,6 +55,25 @@ async def __fetch_page(self, url: str) -> str:
         async with session.get(url, ssl=ssl_context) as response:
             return await response.text()
 
+
+    def __find_image(self, soup: BeautifulSoup, news_type: Publisher):
+        return self.__find_image_dict[news_type.name](soup)
+
+
+    def __find_pub_date(self, soup: BeautifulSoup, news_type: Publisher):
+        property_str = ''
+        if news_type == Publisher.HAN_KYUNG:
+            property_str = 'article:published'
+        else:
+            property_str = "article:published_time"
+        pub_date_element = soup.find("meta", property=property_str)
+        pub_date = pub_date_element["content"] if pub_date_element else "pub date not found"
+
+        if news_type == Publisher.HAN_KYUNG:
+            date_obj = datetime.fromisoformat(pub_date)
+            pub_date = date_obj.isoformat()
+        return pub_date
+
     def __find_title(self, soup: BeautifulSoup, news_type: Publisher) -> str:
         if news_type == Publisher.SEOUL_KYUNG:
             title_element = soup.find("meta", property="og:title")
@@ -97,3 +127,34 @@ def __find_content_from_script(self, result_html: BeautifulSoup) -> str:
             article_body = json_content.get("articleBody", "")
 
         return article_body
+
+    def __find_image_han_kyung(self, soup: BeautifulSoup):
+        figure_img_div = soup.find('div', class_='figure-img')
+
+        # If there is a div tag with the figure-img class,
+        if figure_img_div:
+            # select the img tag inside it
+            img_tag = figure_img_div.find('img')
+            if img_tag:
+                img_url = img_tag.get('src')
+                return img_url
+
+    def __find_image_mae_kyung(self, soup: BeautifulSoup):
+        thumb_div = soup.find('div', class_='thumb_area img')
+        if thumb_div:
+            # select the img tag inside it
+            img_tag = thumb_div.find('img')
+            if img_tag:
+                img_url = img_tag.get('src')
+                return img_url
+
+    def __find_image_seoul_kyung(self, soup: BeautifulSoup):
+        photo_span = soup.find('span', class_='photo')
+
+        # If there is a span tag with the photo class,
+        if photo_span:
+            # select the img tag inside it
+            img_tag = photo_span.find('img')
+            if img_tag:
+                img_url = img_tag.get('src')
+                return img_url
\ No newline at end of file
diff --git a/app/service/simple_article_service.py b/app/service/simple_article_service.py
index 90c6405..a95909b 100644
--- a/app/service/simple_article_service.py
+++ b/app/service/simple_article_service.py
@@ -42,7 +42,6 @@ async def process_generate_article_by_url(
 
         # Convert ai_result (a JSON object) into a SimplifiedArticle object
         simplified_article = SimplifiedArticle(**ai_result)
 
-        # Save to DB
         await ArticleManageService().create_article(
             url=url,
@@ -53,6 +52,8 @@
             simple_content=simplified_article.content,
             phrase=simplified_article.phrase,
             comment=simplified_article.comment,
+            published_at=request_text.pub_date,
+            image_url=request_text.image_url,
             category=MailTypeCategory(ai_result["category"]),
             session=session,
         )
diff --git a/app/service/user_type_service.py b/app/service/user_type_service.py
index 0932873..8256011 100644
--- a/app/service/user_type_service.py
+++ b/app/service/user_type_service.py
@@ -141,7 +141,7 @@ async def create_user_type(
         user_types = calculate_user_type(answers, self.questionnaire_data)
         return await UserTypeRepository().create(
             user_type=UserType(
-                id=answers.id,
+                user_id=answers.id,
                 user_type_issue_finder=user_types[UserTypes.ISSUE_FINDER.value],
                 user_type_lifestyle_consumer=user_types[
                     UserTypes.LIFESTYLE_CONSUMER.value
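
Note on the user_type.py hunk: the old enum body used colons, and inside a class body `NAME: value` is only a type annotation, so the old UserTypes defined no members at all and lookups such as `UserTypes.NONE` raised AttributeError. A minimal sketch of the difference, separate from the patch, valid on the pre-3.12 interpreters this lockfile targets (newer Pythons may reject the annotated form at class creation):

from enum import Enum

# Broken variant (the old code): "NAME: value" is only an annotation,
# so no enum members are created and the enum is empty.
class BrokenUserTypes(Enum):
    NONE: -1
    ISSUE_FINDER: 0

# Fixed variant (what the patch does): "NAME = value" creates members.
class FixedUserTypes(Enum):
    NONE = -1
    ISSUE_FINDER = 0

print(list(BrokenUserTypes))      # []
print(list(FixedUserTypes))       # [<FixedUserTypes.NONE: -1>, <FixedUserTypes.ISSUE_FINDER: 0>]
print(FixedUserTypes.NONE.value)  # -1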
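
Note on the published_at flow: __find_pub_date returns whatever ISO-8601 string the publisher's meta tag carries, which often includes a timezone offset (e.g. "2024-07-15T09:30:00+09:00"), while create_article parses it with datetime.strptime(published_at, '%Y-%m-%dT%H:%M:%S'), a format that rejects any offset suffix with a ValueError. A sketch of a more tolerant parser; parse_published_at is a hypothetical helper, not part of the patch, and it drops tzinfo because Articles.published_at is a naive DateTime column:

from datetime import datetime
from typing import Optional

def parse_published_at(pub_date: str) -> Optional[datetime]:
    # fromisoformat accepts both "2024-07-15T09:30:00" and
    # "2024-07-15T09:30:00+09:00", unlike the strict strptime format.
    try:
        # Drop the offset to match the naive DateTime column.
        return datetime.fromisoformat(pub_date).replace(tzinfo=None)
    except ValueError:
        # e.g. the "pub date not found" fallback string from __find_pub_date
        return None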