Skip to content

Commit

Permalink
Merge pull request #26 from tukcomCD2024/feat/#25-backend-crawling
Browse files Browse the repository at this point in the history
Feat/#25 Backend Crawling (뉴스 정보 크롤링)
  • Loading branch information
yeonjy authored Mar 18, 2024
2 parents 8a32a42 + df662c3 commit eef9c84
Show file tree
Hide file tree
Showing 21 changed files with 406 additions and 46 deletions.
4 changes: 4 additions & 0 deletions backend/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ dependencies {
runtimeOnly 'com.h2database:h2'
runtimeOnly 'com.mysql:mysql-connector-j'
annotationProcessor 'org.projectlombok:lombok'

// jsoup 의존성 추가
implementation 'org.jsoup:jsoup:1.15.3'

testImplementation 'org.springframework.boot:spring-boot-starter-test'
testImplementation 'org.springframework.security:spring-security-test'
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
package com.rollthedice.backend.domain.crawling;

import com.rollthedice.backend.domain.dto.NewsUrlDto;
import com.rollthedice.backend.domain.news.entity.News;
import com.rollthedice.backend.domain.news.service.NewsCategory;
import com.rollthedice.backend.domain.news.service.NewsService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.util.Objects;

/**
 * Scheduled crawler for the Naver News section pages.
 *
 * <p>For every {@link NewsCategory} it first stores the article links found on the section
 * page (via {@link NewsService#addNews}), then visits stored articles and fills in their title,
 * body text, category and post date.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class NewsCrawlingService {
    /** Base address of a section page; the category number is appended to it. */
    private static final String MAIN_URL = "https://news.naver.com/section/";
    /** Spring cron expression: 06:00:00 and 12:00:00 every day. */
    private static final String CRON = "0 0 6,12 * * *";
    /** Time zone in which {@link #CRON} is evaluated. */
    private static final String ZONE = "Asia/Seoul";

    private final NewsService newsService;

    /**
     * Crawls every category: collects the article links of the section page, then scrapes the
     * body of each stored article that has not been categorised yet.
     *
     * <p>Articles that already carry a category are skipped. Without that guard every stored
     * article would be fetched again for each category and re-labelled with whichever category
     * happens to be processed last.
     *
     * @throws IOException if a section page or an article page cannot be fetched; the crawl
     *     stops at the first failure
     */
    @Scheduled(cron = CRON, zone = ZONE)
    public void scrap() throws IOException {
        for (NewsCategory category : NewsCategory.values()) {
            String categoryUrl = MAIN_URL + category.getNum();
            String categoryName = category.getName();

            scrapNewsUrls(categoryUrl);
            for (final News news : newsService.getAllNews()) {
                if (news.getCategory() != null) {
                    // Already filled in for an earlier category (or an earlier run).
                    continue;
                }
                scrapNewsContentsAndUpdate(categoryName, news);
            }
        }
    }

    /**
     * Stores one news entry (article link plus optional thumbnail) per list item of the given
     * section page.
     *
     * @param categoryUrl address of the section page to read
     * @throws IOException if the section page cannot be fetched
     */
    private void scrapNewsUrls(String categoryUrl) throws IOException {
        Document doc = Jsoup.connect(categoryUrl).get();
        Elements newsList = doc.select(".sa_list");

        for (Element news : newsList.select("li")) {
            Element titleLink = news.selectFirst(".sa_text_title");
            if (titleLink == null) {
                // A list item without a headline link cannot become a news entry; skip it
                // instead of aborting the whole crawl with a NullPointerException.
                continue;
            }
            String url = titleLink.attr("href");
            if (url.isEmpty()) {
                // attr() yields "" when the attribute is absent; there is nothing to crawl later.
                continue;
            }

            newsService.addNews(new NewsUrlDto(url, scrapThumbnailUrl(news)));
        }
    }

    /**
     * Reads the thumbnail image address of a list item.
     *
     * @param news list item element of the section page
     * @return the {@code src} attribute of the thumbnail image, or {@code null} when the item
     *     has no thumbnail image
     */
    private String scrapThumbnailUrl(final Element news) {
        Element thumbnailUrlElement = news.selectFirst(".sa_thumb_link img");
        return thumbnailUrlElement != null ? thumbnailUrlElement.attr("src") : null;
    }

    /**
     * Fetches the article page of the given news entry and copies title, content, category and
     * post date onto it.
     *
     * @param categoryName display name of the category to record on the entry
     * @param news entry whose URL is fetched and whose body fields are filled in
     * @throws IOException if the article page cannot be fetched
     */
    private void scrapNewsContentsAndUpdate(String categoryName, News news) throws IOException {
        Document doc = Jsoup.connect(news.getUrl()).get();

        String title = scrapTitle(doc);
        String content = scrapContent(doc);
        String postDate = scrapPostDate(doc);

        // NOTE(review): this only mutates the entity object; no save call is visible here.
        // Confirm the change is actually persisted for entities returned by getAllNews().
        news.addNewsBody(title, content, categoryName, postDate);
    }

    /**
     * Extracts the headline, trying the primary selector first and a fallback selector second.
     *
     * @param doc parsed article page
     * @return the headline text, or {@code null} when neither selector matches
     */
    private String scrapTitle(final Document doc) {
        Element titleElement = doc.selectFirst("#ct > div.media_end_head.go_trans > div.media_end_head_title > h2");
        if (titleElement == null) {
            titleElement = doc.selectFirst("#content > div.end_ct > div > h2");
        }
        if (titleElement != null) {
            return titleElement.text();
        }
        return null;
    }

    /**
     * Extracts the article body as text by removing tags and newline characters from the
     * matched element's outer HTML.
     *
     * @param doc parsed article page
     * @return the stripped body; an empty string when neither selector matches
     */
    private String scrapContent(final Document doc) {
        Elements contentElements = doc.select("article#dic_area");
        if (contentElements.isEmpty()) {
            // NOTE(review): "articeBody" (sic) is kept verbatim; presumably it mirrors the id
            // used by the alternate page layout - confirm before "correcting" the spelling.
            contentElements = doc.select("#articeBody");
        }
        return contentElements.outerHtml().replaceAll("\\<[^>]*>|\\n", "");
    }

    /**
     * Extracts the post date, trying the primary selector first and a fallback selector second.
     *
     * @param doc parsed article page
     * @return the {@code data-date-time} attribute of the primary date element, otherwise the
     *     text of the fallback element, or {@code null} when neither selector matches
     */
    private String scrapPostDate(final Document doc) {
        Element dateElement = doc.selectFirst("div#ct> div.media_end_head.go_trans > div.media_end_head_info.nv_notrans > div.media_end_head_info_datestamp > div > span");
        if (dateElement != null) {
            return dateElement.attr("data-date-time");
        }

        Element altDateElement = doc.selectFirst("#content > div.end_ct > div > div.article_info > span > em");
        if (altDateElement != null) {
            return altDateElement.text();
        }
        return null;
    }
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package com.rollthedice.backend.domain.dto;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;

/**
 * Carries the two values collected for a news entry from a section page: the article link
 * and the thumbnail image address.
 */
@Getter
@Setter
@NoArgsConstructor
@AllArgsConstructor
public class NewsUrlDto {
    /** Address of the article page. */
    private String url;

    /** Address of the thumbnail image; may be {@code null} when the entry has no thumbnail. */
    private String thumbnailUrl;
}
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
package com.rollthedice.backend.domain.news.entity;

import com.rollthedice.backend.global.config.BaseTimeEntity;
import jakarta.persistence.Entity;
import jakarta.persistence.GeneratedValue;
import jakarta.persistence.GenerationType;
import jakarta.persistence.Id;
import jakarta.persistence.*;
import lombok.AccessLevel;
import lombok.Builder;
import lombok.Getter;
import lombok.NoArgsConstructor;

Expand All @@ -17,9 +15,30 @@ public class News extends BaseTimeEntity {
@GeneratedValue(strategy = GenerationType.IDENTITY)
private Long id;

// Address of the article page; set through the builder constructor.
private String url;
// Headline; filled in by addNewsBody.
private String title;

// Article text, mapped as a large object; overwritten by updateSummarizedContent.
@Lob
private String content;
// NOTE(review): both `thumbnail` and `thumbnailUrl` appear in this view, but the constructor
// only assigns `thumbnailUrl` - presumably `thumbnail` is the removed side of the diff; confirm.
private String thumbnail;
private String thumbnailUrl;
// Category display name; filled in by addNewsBody.
private String category;
// Post date kept as a plain string, exactly as passed to addNewsBody.
private String postDate;

/**
 * Creates a news entry that only knows where its article and thumbnail live; the remaining
 * fields stay unset until {@code addNewsBody} is called.
 *
 * @param url address of the article page
 * @param thumbnailUrl address of the thumbnail image
 */
@Builder
public News(String url, String thumbnailUrl) {
    this.thumbnailUrl = thumbnailUrl;
    this.url = url;
}

/**
 * Fills in the article details of this entry, overwriting any values already present.
 *
 * @param title headline of the article
 * @param content article text
 * @param category category display name
 * @param postDate post date as a string
 */
public void addNewsBody(String title, String content, String category, String postDate) {
    this.category = category;
    this.postDate = postDate;
    this.title = title;
    this.content = content;
}

/**
 * Replaces the stored content with the given text. The previous content is not kept.
 *
 * @param content text to store as the new content (a summary, going by the method name)
 */
public void updateSummarizedContent(String content) {
this.content = content;
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package com.rollthedice.backend.domain.news.repository;

import com.rollthedice.backend.domain.news.entity.News;
import org.springframework.data.jpa.repository.JpaRepository;

/**
 * Spring Data JPA repository for {@link News} entities with {@code Long} identifiers.
 * Declares no custom queries; only the operations inherited from {@link JpaRepository} are available.
 */
public interface NewsRepository extends JpaRepository<News, Long> {
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package com.rollthedice.backend.domain.news.service;

import lombok.Getter;

/**
 * News categories, each pairing a numeric section code with a Korean display name.
 *
 * <p>The Lombok-generated {@code getName()} returns the display name and is distinct from the
 * built-in enum method {@code name()}, which returns the constant identifier.
 */
@Getter
public enum NewsCategory {
    POLITICS(100, "정치"),
    ECONOMY(101, "경제"),
    SOCIETY(102, "사회"),
    LIVING(103, "생활/문화"),
    WORLD(104, "세계"),
    SCIENCE(105, "IT/과학");

    /** Numeric section code of the category. */
    private final int num;

    /** Display name of the category. */
    private final String name;

    NewsCategory(int sectionNum, String displayName) {
        num = sectionNum;
        name = displayName;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package com.rollthedice.backend.domain.news.service;

import com.rollthedice.backend.domain.dto.NewsUrlDto;
import com.rollthedice.backend.domain.news.entity.News;
import com.rollthedice.backend.domain.news.repository.NewsRepository;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;

import java.util.List;

/**
 * Service for storing and listing {@link News} entities through {@link NewsRepository}.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class NewsService {
    private final NewsRepository newsRepository;

    /**
     * Saves a new entry built from the link and thumbnail address in the given DTO.
     *
     * @param dto carrier of the article URL and thumbnail URL
     */
    @Transactional
    public void addNews(NewsUrlDto dto) {
        News entry = News.builder()
                .url(dto.getUrl())
                .thumbnailUrl(dto.getThumbnailUrl())
                .build();
        newsRepository.save(entry);
    }

    /**
     * Loads every stored news entry inside a read-only transaction.
     *
     * @return all entries returned by the repository
     */
    @Transactional(readOnly = true)
    public List<News> getAllNews() {
        return newsRepository.findAll();
    }
}
16 changes: 16 additions & 0 deletions iOS/RollTheDice/RollTheDice.xcodeproj/project.pbxproj
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
6C3237AC2B7C382200B699AB /* News.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6C3237AB2B7C382200B699AB /* News.swift */; };
6C3237AE2B7C382E00B699AB /* NewsViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6C3237AD2B7C382E00B699AB /* NewsViewModel.swift */; };
6C3237B22B7C385000B699AB /* NewsListViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6C3237B12B7C385000B699AB /* NewsListViewModel.swift */; };
6C3237B52B7C433D00B699AB /* ChatTypeView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6C3237B42B7C433D00B699AB /* ChatTypeView.swift */; };
6C3237B72B7C434600B699AB /* ChatType.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6C3237B62B7C434600B699AB /* ChatType.swift */; };
6C77048C2B722686001B17CB /* MainTabView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6C77048B2B722686001B17CB /* MainTabView.swift */; };
6C77048F2B7229B1001B17CB /* NewsListView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6C77048E2B7229B1001B17CB /* NewsListView.swift */; };
6C7704992B722A20001B17CB /* MainTabViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6C7704982B722A20001B17CB /* MainTabViewModel.swift */; };
Expand Down Expand Up @@ -76,6 +78,8 @@
6C3237AB2B7C382200B699AB /* News.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = News.swift; sourceTree = "<group>"; };
6C3237AD2B7C382E00B699AB /* NewsViewModel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = NewsViewModel.swift; sourceTree = "<group>"; };
6C3237B12B7C385000B699AB /* NewsListViewModel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = NewsListViewModel.swift; sourceTree = "<group>"; };
6C3237B42B7C433D00B699AB /* ChatTypeView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatTypeView.swift; sourceTree = "<group>"; };
6C3237B62B7C434600B699AB /* ChatType.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatType.swift; sourceTree = "<group>"; };
6C77048B2B722686001B17CB /* MainTabView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MainTabView.swift; sourceTree = "<group>"; };
6C77048E2B7229B1001B17CB /* NewsListView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = NewsListView.swift; sourceTree = "<group>"; };
6C7704982B722A20001B17CB /* MainTabViewModel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MainTabViewModel.swift; sourceTree = "<group>"; };
Expand Down Expand Up @@ -221,6 +225,15 @@
path = NewsCard;
sourceTree = "<group>";
};
6C3237B32B7C433000B699AB /* ChatType */ = {
isa = PBXGroup;
children = (
6C3237B42B7C433D00B699AB /* ChatTypeView.swift */,
6C3237B62B7C434600B699AB /* ChatType.swift */,
);
path = ChatType;
sourceTree = "<group>";
};
6C7704852B72260F001B17CB /* Resources */ = {
isa = PBXGroup;
children = (
Expand Down Expand Up @@ -275,6 +288,7 @@
6C7704902B7229B6001B17CB /* Chat */ = {
isa = PBXGroup;
children = (
6C3237B32B7C433000B699AB /* ChatType */,
352EE6852B7611F500E51B60 /* Model */,
352EE6842B7611E900E51B60 /* View */,
);
Expand Down Expand Up @@ -442,13 +456,15 @@
6C3237A72B7C37E500B699AB /* BookmarkListViewModel.swift in Sources */,
35AA27542B7A1489004EFFAE /* AIChatCustomView.swift in Sources */,
35AA27452B79FFAB004EFFAE /* MockChatInteractor.swift in Sources */,
6C3237B52B7C433D00B699AB /* ChatTypeView.swift in Sources */,
6C3237B22B7C385000B699AB /* NewsListViewModel.swift in Sources */,
35AA272F2B79FCAE004EFFAE /* ChatExampleView.swift in Sources */,
35AA27382B79FE96004EFFAE /* MockMessage.swift in Sources */,
352EE6832B76103400E51B60 /* ChatTitleView.swift in Sources */,
6C32379F2B7C376D00B699AB /* Bookmark.swift in Sources */,
6C7704A12B722CEB001B17CB /* ProfileView.swift in Sources */,
6C77049F2B722CE4001B17CB /* ARView.swift in Sources */,
6C3237B72B7C434600B699AB /* ChatType.swift in Sources */,
6CC4DDC72B5574670080E7E8 /* RollTheDiceApp.swift in Sources */,
6C77048C2B722686001B17CB /* MainTabView.swift in Sources */,
6C7704992B722A20001B17CB /* MainTabViewModel.swift in Sources */,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ public class BookmarkListViewModel: ObservableObject {
init(
bookmarkList: [Bookmark] = [
.init(title: "2024년 ‘소셜 미디어 다이어트’를 위해 바꿔볼 것", date: "2023년12월3일", image: "exampleNews", content: "2024년으로 접어든지 한 달이 넘었다. 하지만 올 해가 어떻게 흘러갈지 예측하기는 쉽지 않다. 한 가지 확실한 것은 정치적으로 매우 중요한 해라는 점이다. 미국과 러시아, 우크라이나, 방글라데시, 인도, 대만, 한국, 남아프리카공화국, 유럽의회, 영국에서 선거가 치러질 예정이다.", isBookmarked: false),
.init(title: "2024년 ‘소셜 미디어 다이어트’를 위해 바꿔볼 것", date: "2023년12월3일", image: "exampleNews", content: "2024년으로 접어든지 한 달이 넘었다. 하지만 올 해가 어떻게 흘러갈지 예측하기는 쉽지 않다. 한 가지 확실한 것은 정치적으로 매우 중요한 해라는 점이다. 미국과 러시아, 우크라이나, 방글라데시, 인도, 대만, 한국, 남아프리카공화국, 유럽의회, 영국에서 선거가 치러질 예정이다.", isBookmarked: false),
.init(title: "2024년 ‘소셜 미디어 다이어트’를 위해 바꿔볼 것", date: "2023년12월3일", image: "exampleNews", content: "2024년으로 접어든지 한 달이 넘었다. 하지만 올 해가 어떻게 흘러갈지 예측하기는 쉽지 않다. 한 가지 확실한 것은 정치적으로 매우 중요한 해라는 점이다. 미국과 러시아, 우크라이나, 방글라데시, 인도, 대만, 한국, 남아프리카공화국, 유럽의회, 영국에서 선거가 치러질 예정이다.", isBookmarked: false),
.init(title: "2024년 ‘소셜 미디어 다이어트’를 위해 바꿔볼 것", date: "2023년12월3일", image: "exampleNews", content: "2024년으로 접어든지 한 달이 넘었다. 하지만 올 해가 어떻게 흘러갈지 예측하기는 쉽지 않다. 한 가지 확실한 것은 정치적으로 매우 중요한 해라는 점이다. 미국과 러시아, 우크라이나, 방글라데시, 인도, 대만, 한국, 남아프리카공화국, 유럽의회, 영국에서 선거가 치러질 예정이다.", isBookmarked: false),
.init(title: "해외원정 가던 줄기세포치료 이젠 국내서도 받을 수 있다", date: "2023년2월13일", image: "exampleNews", content: "첨생법 개정안이 이달 1일 국회를 통과해 내년부터 시행된다. 이에 따라 식약처 허가 없이도 안전성·유효성이 확인되면 국내에서도 첨단재생의료 치료가 허용되고 모든 질환에 대한 임상연구가 가능해졌다. ", isBookmarked: false),
.init(title: "홀로 선 자립준비청년, 배곯지 않게…우체국, 매일 식비 지원", date: "2023년2월13일", content: "과학기술정보통신부 우정사업본부는 사회에 첫발을 내딛는 자립준비청년이 건강한 사회구성원으로 성장하도록 식비를 지원하는 '우체국 청년밥심 스타트 온(溫)' 사업을 확대 추진한다고 14일 밝혔다.", isBookmarked: false),
.init(title: "NHN, 작년 영업익 555억원...전년비 42% ↑", date: "2023년2월13일", image: "exampleNews", content: "2NHN은 연결기준 지난해 영업이익이 555억원으로 전년 대비 42.2% 증가했다고 14일 밝혔다.같은 기간 매출은 7.3% 증가한 2조2696억원으로 연간 최대치를 기록했다. 작년 4분기 매출은 5983억원으로 전년 동기 대비 6.7% 올랐다. 반면 영업손실은 78억원으로 적자전환했다. 커머스 부문의 장기 미회수채권 대손상각비 인식과 기술 부문의 기 인식 매출 차감 등 일회성 요인이 영향을 미쳤다.", isBookmarked: false),
]
) {
self.bookmarkList = bookmarkList
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//
// ChatType.swift
// RollTheDice
//
// Created by Subeen on 2/14/24.
//

import Foundation

/// The two chat types the app distinguishes: `ai` and `user`.
/// Raw values are the implicit case names ("ai", "user").
enum ChatType: String {
    case ai
    case user

    /// Name of the chat type: "ai" for `.ai`, "user" for `.user`.
    var name: String {
        // Each case's implicit raw value is exactly the string to return.
        return rawValue
    }
}
Loading

0 comments on commit eef9c84

Please sign in to comment.