From 73fa4190f328c4d22262ae026c0199d4bcf20032 Mon Sep 17 00:00:00 2001 From: yeonjy Date: Wed, 3 Apr 2024 03:09:24 +0900 Subject: [PATCH] =?UTF-8?q?refactor:=20=EC=B9=B4=ED=85=8C=EA=B3=A0?= =?UTF-8?q?=EB=A6=AC=EB=B3=84=20=ED=81=AC=EB=A1=A4=EB=A7=81=20=EA=B0=9C?= =?UTF-8?q?=EC=88=98=20=EC=A0=9C=ED=95=9C=ED=95=98=EB=8F=84=EB=A1=9D=20?= =?UTF-8?q?=EB=A1=9C=EC=A7=81=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../domain/crawling/NewsCrawlingService.java | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/backend/core/src/main/java/com/rollthedice/backend/domain/crawling/NewsCrawlingService.java b/backend/core/src/main/java/com/rollthedice/backend/domain/crawling/NewsCrawlingService.java index 461267c5..28e72e9e 100644 --- a/backend/core/src/main/java/com/rollthedice/backend/domain/crawling/NewsCrawlingService.java +++ b/backend/core/src/main/java/com/rollthedice/backend/domain/crawling/NewsCrawlingService.java @@ -10,6 +10,7 @@ import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; +import org.springframework.beans.factory.annotation.Value; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; @@ -25,6 +26,9 @@ public class NewsCrawlingService { private static final String CRON = "0 0 6,12 * * *"; private static final String ZONE = "Asia/Seoul"; + @Value("${crawling.quantity}") + private int crawlingQuantity; + private final NewsService newsService; @Transactional @@ -34,19 +38,32 @@ public void scrap() throws IOException { String categoryUrl = MAIN_URL + category.getNum(); String categoryName = category.getName(); - scrapNewsUrls(categoryUrl); + scrapCategoryNews(categoryUrl); for (final News news : newsService.getNotCrawled()) { - scrapNewsContentsAndUpdate(categoryName, news); + Document doc = Jsoup.connect(news.getUrl()).get(); + String title = scrapTitle(doc); + String content = scrapContent(doc); + String postDate = scrapPostDate(doc); + + news.addNewsBody(title, content, categoryName, postDate); } } newsService.summarizeNewsContent(); } - private void scrapNewsUrls(String categoryUrl) throws IOException { + private void scrapCategoryNews(String categoryUrl) throws IOException { Document doc = Jsoup.connect(categoryUrl).get(); - Elements newsList = doc.select(".sa_list"); + Elements newsList = doc.select(".sa_list").select("li"); + if (newsList.size() < crawlingQuantity) { + scrapNewsUrl(newsList.size(), newsList); + return; + } + scrapNewsUrl(crawlingQuantity, newsList); + } - for (Element news : newsList.select("li")) { + private void scrapNewsUrl(int quantity, Elements newsList) { + for (int i = 0; i < quantity; i++) { + Element news = newsList.get(i); String thumbnailUrl = scrapThumbnailUrl(news); String url = Objects.requireNonNull(news.selectFirst(".sa_text_title")).attr("href");