Feat/#172 Apply Spring Batch to crawling and MQ publish logic #178

Merged
merged 21 commits on Jul 30, 2024
Commits (21)
f8a0efa
Merge pull request #171 from tukcomCD2024/Dev-backend
yeonjy Jun 30, 2024
ca92364
feat: Implement BatchConfig
yeonjy Jul 7, 2024
3484b90
feat: Implement the News URL Crawling Step
yeonjy Jul 7, 2024
17d319e
feat: Add Spring Batch dependency
yeonjy Jul 7, 2024
e25b185
feat: Implement NewsUrlReader
yeonjy Jul 14, 2024
4e4d620
refactor: Change NewsUrlReader to return news URLs one at a time
yeonjy Jul 17, 2024
2a695b0
feat: Add a method to the NewsCategory enum that returns its URL
yeonjy Jul 17, 2024
9470276
refactor: Save data with JdbcBatchItemWriter
yeonjy Jul 17, 2024
95cc818
feat: Implement InitNewsDto
yeonjy Jul 17, 2024
ac63576
Updated submodule backend/core/src/main/resources/scoop-core-config
yeonjy Jul 17, 2024
c35922b
Updated submodule backend/core/src/main/resources/scoop-core-config
yeonjy Jul 18, 2024
4688723
refactor: Remove category from addNewsBody
yeonjy Jul 19, 2024
27964cf
refactor: Move BatchJobConfig to the batch directory
yeonjy Jul 20, 2024
55de0c2
Updated submodule backend/core/src/main/resources/scoop-core-config
yeonjy Jul 20, 2024
5d75a47
feat: Implement BatchJobConfig
yeonjy Jul 20, 2024
54226ab
feat: Implement UncrawledNewsContentReader
yeonjy Jul 20, 2024
0ad7fed
refactor: Move to the batch directory
yeonjy Jul 20, 2024
9bde769
remove: BatchJobConfig
yeonjy Jul 20, 2024
20de411
feat: Add JobScope & StepScope
yeonjy Jul 20, 2024
b6da8ca
remove: Old crawling logic replaced by the batch job
yeonjy Jul 20, 2024
195ca12
Updated submodule backend/core/src/main/resources/scoop-core-config
yeonjy Jul 30, 2024
2 changes: 2 additions & 0 deletions backend/core/build.gradle
@@ -46,6 +46,7 @@ dependencies {
    implementation 'org.springframework.boot:spring-boot-starter-data-redis:2.3.1.RELEASE'
    implementation 'org.springframework.boot:spring-boot-starter-webflux'
    implementation 'com.fasterxml.jackson.core:jackson-core:2.17.0'
    implementation 'org.springframework.boot:spring-boot-starter-batch'

    implementation "com.querydsl:querydsl-jpa:${queryDslVersion}:jakarta"
    annotationProcessor "com.querydsl:querydsl-apt:${queryDslVersion}:jakarta"
@@ -62,6 +63,7 @@ dependencies {
    testImplementation 'org.springframework.security:spring-security-test'
    testImplementation 'org.springframework.restdocs:spring-restdocs-mockmvc'
    testImplementation 'io.rest-assured:rest-assured:5.1.1'
    testImplementation 'org.springframework.batch:spring-batch-test'
}

tasks.named('bootBuildImage') {
119 changes: 119 additions & 0 deletions backend/core/src/main/java/com/rollthedice/backend/batch/BatchJobConfig.java
@@ -0,0 +1,119 @@
package com.rollthedice.backend.batch;

import com.rollthedice.backend.batch.newsContentStep.PreSummarizedNewsDto;
import com.rollthedice.backend.batch.newsContentStep.UncrawledNewsContentReader;
import com.rollthedice.backend.batch.newsUrlStep.InitNewsDto;
import com.rollthedice.backend.batch.newsUrlStep.NewsUrlReader;
import com.rollthedice.backend.domain.news.contentqueue.ContentProducer;
import com.rollthedice.backend.domain.news.dto.ContentMessageDto;
import com.rollthedice.backend.domain.news.repository.NewsRepository;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.batch.core.Job;
import org.springframework.batch.core.Step;
import org.springframework.batch.core.configuration.annotation.JobScope;
import org.springframework.batch.core.configuration.annotation.StepScope;
import org.springframework.batch.core.job.builder.JobBuilder;
import org.springframework.batch.core.launch.support.RunIdIncrementer;
import org.springframework.batch.core.repository.JobRepository;
import org.springframework.batch.core.step.builder.StepBuilder;
import org.springframework.batch.item.ItemProcessor;
import org.springframework.batch.item.ItemReader;
import org.springframework.batch.item.database.JdbcBatchItemWriter;
import org.springframework.batch.item.database.builder.JdbcBatchItemWriterBuilder;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.transaction.PlatformTransactionManager;

import javax.sql.DataSource;

@Slf4j
@Configuration
@RequiredArgsConstructor
public class BatchJobConfig {

    @Value("${batch.chunk-size}")
    private int chunkSize;

    private final DataSource dataSource;
    private final NewsRepository newsRepository;
    private final ContentProducer contentProducer;

    @Bean
    public Job scrapJob(JobRepository jobRepository,
                        Step crawlingNewsUrlStep, Step crawlingNewsContentStep) {
        return new JobBuilder("scrapJob", jobRepository)
                .incrementer(new RunIdIncrementer())
                .start(crawlingNewsUrlStep)
                .next(crawlingNewsContentStep)
                .build();
    }

    @Bean
    @JobScope
    public Step crawlingNewsUrlStep(JobRepository jobRepository,
                                    PlatformTransactionManager transactionManager) {
        return new StepBuilder("crawlingNewsUrlStep", jobRepository)
                .allowStartIfComplete(true)
                .<InitNewsDto, InitNewsDto>chunk(30, transactionManager)
                .reader(newsUrlReader())
                .writer(newsUrlWriter())
                .build();
    }

    @Bean
    @StepScope
    public ItemReader<InitNewsDto> newsUrlReader() {
        return new NewsUrlReader();
    }

    @Bean
    @StepScope
    public JdbcBatchItemWriter<InitNewsDto> newsUrlWriter() {
        return new JdbcBatchItemWriterBuilder<InitNewsDto>()
                .dataSource(dataSource)
                .sql("insert into news(url, category) values (:url, :newsCategory)")
                .beanMapped()
                .build();
    }

    @Bean
    @JobScope
    public Step crawlingNewsContentStep(JobRepository jobRepository,
                                        PlatformTransactionManager transactionManager) {
        return new StepBuilder("crawlingNewsContentStep", jobRepository)
                .allowStartIfComplete(true)
                .<PreSummarizedNewsDto, PreSummarizedNewsDto>chunk(chunkSize, transactionManager)
                .reader(uncrawledNewsContentReader())
                .processor(summarizeContentProcessor())
                .writer(newsContentWriter())
                .build();
    }

    @Bean
    @StepScope
    public ItemReader<PreSummarizedNewsDto> uncrawledNewsContentReader() {
        return new UncrawledNewsContentReader(newsRepository);
    }

    @Bean
    @StepScope
    public ItemProcessor<PreSummarizedNewsDto, PreSummarizedNewsDto> summarizeContentProcessor() {
        return dto -> {
            contentProducer.sendMessage(new ContentMessageDto(dto.getId(), dto.getContent()));
            return dto;
        };
    }

    @Bean
    @StepScope
    public JdbcBatchItemWriter<PreSummarizedNewsDto> newsContentWriter() {
        return new JdbcBatchItemWriterBuilder<PreSummarizedNewsDto>()
                .dataSource(dataSource)
                .sql("update news set title = :title, content = :content, post_date = :postDate" +
                        " where id = :id")
                .beanMapped()
                .build();
    }
}
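Neither the values of batch.chunk-size and crawling.quantity nor the code that triggers scrapJob appear in this diff (the runtime configuration lives in the scoop-core-config submodule). The following is a minimal sketch of one way the job could be launched on a schedule; the scheduler class, cron expression, and parameter name are assumptions for illustration, not part of this PR.

// Hypothetical trigger for scrapJob -- not included in this PR. It assumes the
// properties batch.chunk-size and crawling.quantity are defined in the
// scoop-core-config submodule, and that scheduling is enabled elsewhere
// (e.g. via @EnableScheduling). Typically paired with
// spring.batch.job.enabled=false so the job does not also auto-run at startup.
package com.rollthedice.backend.batch;

import lombok.RequiredArgsConstructor;
import org.springframework.batch.core.Job;
import org.springframework.batch.core.JobParameters;
import org.springframework.batch.core.JobParametersBuilder;
import org.springframework.batch.core.launch.JobLauncher;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

@Component
@RequiredArgsConstructor
public class ScrapJobScheduler {

    private final JobLauncher jobLauncher;
    private final Job scrapJob;

    // Run every hour; the timestamp parameter keeps each execution's
    // JobInstance unique alongside the RunIdIncrementer.
    @Scheduled(cron = "0 0 * * * *")
    public void runScrapJob() throws Exception {
        JobParameters params = new JobParametersBuilder()
                .addLong("launchedAt", System.currentTimeMillis())
                .toJobParameters();
        jobLauncher.run(scrapJob, params);
    }
}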
14 changes: 14 additions & 0 deletions backend/core/src/main/java/com/rollthedice/backend/batch/newsContentStep/PreSummarizedNewsDto.java
@@ -0,0 +1,14 @@
package com.rollthedice.backend.batch.newsContentStep;

import lombok.*;

@Getter
@NoArgsConstructor
@AllArgsConstructor
@Builder
public class PreSummarizedNewsDto {
    private Long id;
    private String title;
    private String content;
    private String postDate;
}
80 changes: 80 additions & 0 deletions backend/core/src/main/java/com/rollthedice/backend/batch/newsContentStep/UncrawledNewsContentReader.java
@@ -0,0 +1,80 @@
package com.rollthedice.backend.batch.newsContentStep;

import com.rollthedice.backend.domain.news.entity.News;
import com.rollthedice.backend.domain.news.repository.NewsRepository;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.batch.item.ItemReader;

import java.io.IOException;
import java.util.Iterator;

public class UncrawledNewsContentReader implements ItemReader<PreSummarizedNewsDto> {
    private final NewsRepository newsRepository;
    private Iterator<News> uncrawledNewsContents;

    public UncrawledNewsContentReader(NewsRepository newsRepository) {
        this.newsRepository = newsRepository;
        uncrawledNewsContents = newsRepository.findAllByContentIsNull().iterator();
    }

    @Override
    public PreSummarizedNewsDto read() throws IOException {
        if (!hasNextUncrawledNews()) {
            return null;
        }
        News news = uncrawledNewsContents.next();
        Document doc = Jsoup.connect(news.getUrl()).get();
        return getNewsContent(news, doc);
    }

    private boolean hasNextUncrawledNews() {
        if (!uncrawledNewsContents.hasNext()) {
            uncrawledNewsContents = newsRepository.findAllByContentIsNull().iterator();
        }
        return uncrawledNewsContents.hasNext();
    }

    private PreSummarizedNewsDto getNewsContent(News news, Document doc) {
        return PreSummarizedNewsDto.builder()
                .id(news.getId())
                .title(scrapTitle(doc))
                .content(scrapContent(doc))
                .postDate(scrapPostDate(doc))
                .build();
    }

    private String scrapTitle(final Document doc) {
        Element titleElement = doc.selectFirst("#ct > div.media_end_head.go_trans > div.media_end_head_title > h2");
        if (titleElement == null) {
            titleElement = doc.selectFirst("#content > div.end_ct > div > h2");
        }
        if (titleElement != null) {
            return titleElement.text();
        }
        return null;
    }

    private String scrapContent(final Document doc) {
        Elements contentElements = doc.select("article#dic_area");
        if (contentElements.isEmpty()) {
            contentElements = doc.select("#articeBody");
        }
        return contentElements.outerHtml().replaceAll("\\<[^>]*>|\\n", "");
    }

    private String scrapPostDate(final Document doc) {
        Element dateElement = doc.selectFirst("div#ct> div.media_end_head.go_trans > div.media_end_head_info.nv_notrans > div.media_end_head_info_datestamp > div > span");
        if (dateElement != null) {
            return dateElement.attr("data-date-time");
        } else {
            Element altDateElement = doc.selectFirst("#content > div.end_ct > div > div.article_info > span > em");
            if (altDateElement != null) {
                return altDateElement.text();
            }
        }
        return null;
    }
}
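UncrawledNewsContentReader relies on NewsRepository.findAllByContentIsNull(), which is not shown in this diff. Below is a rough sketch of the assumed Spring Data JPA repository; only the method name is confirmed by the reader above, while the List return type and package are guesses.

// Assumed shape of the repository used by UncrawledNewsContentReader -- the
// real NewsRepository is not part of this diff.
package com.rollthedice.backend.domain.news.repository;

import java.util.List;

import org.springframework.data.jpa.repository.JpaRepository;

import com.rollthedice.backend.domain.news.entity.News;

public interface NewsRepository extends JpaRepository<News, Long> {

    // Derived query: every News row whose content column is still null,
    // i.e. URLs inserted by the first step but not yet crawled.
    List<News> findAllByContentIsNull();
}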
15 changes: 15 additions & 0 deletions backend/core/src/main/java/com/rollthedice/backend/batch/newsUrlStep/InitNewsDto.java
@@ -0,0 +1,15 @@
package com.rollthedice.backend.batch.newsUrlStep;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;

@Getter
@NoArgsConstructor
@AllArgsConstructor
@Setter
public class InitNewsDto {
    private String newsCategory;
    private String url;
}
53 changes: 53 additions & 0 deletions backend/core/src/main/java/com/rollthedice/backend/batch/newsUrlStep/NewsUrlReader.java
@@ -0,0 +1,53 @@
package com.rollthedice.backend.batch.newsUrlStep;

import com.rollthedice.backend.domain.news.entity.NewsCategory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.batch.item.ItemReader;
import org.springframework.beans.factory.annotation.Value;

import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;

public class NewsUrlReader implements ItemReader<InitNewsDto> {
    @Value("${crawling.quantity}")
    private int crawlingQuantity;

    private final Iterator<NewsCategory> categories;
    private final Queue<InitNewsDto> initNews = new LinkedList<>();

    public NewsUrlReader() {
        categories = Arrays.stream(NewsCategory.values()).collect(Collectors.toList()).iterator();
    }

    @Override
    public InitNewsDto read() throws IOException {
        while (initNews.isEmpty() && categories.hasNext()) {
            NewsCategory category = categories.next();
            initNews.addAll(scrapCategoryNews(category));
        }
        return initNews.poll();
    }

    private List<InitNewsDto> scrapCategoryNews(NewsCategory category) throws IOException {
        Document doc = Jsoup.connect(category.getCategoryUrl()).get();
        Elements newsList = doc.select(".sa_list").select("li");
        if (newsList.size() < crawlingQuantity) {
            return scrapNewsUrl(newsList.size(), newsList, category);
        }
        return scrapNewsUrl(crawlingQuantity, newsList, category);
    }

    private List<InitNewsDto> scrapNewsUrl(int quantity, Elements newsList, NewsCategory category) {
        List<InitNewsDto> urls = new ArrayList<>();
        for (int i = 0; i < quantity; i++) {
            Element news = newsList.get(i);
            String url = Objects.requireNonNull(news.selectFirst(".sa_text_title")).attr("href");
            urls.add(new InitNewsDto(category.getName(), url));
        }
        return urls;
    }
}
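Commit 2a695b0 adds a URL-returning method to the NewsCategory enum, but the enum itself is not included in this diff. The sketch below shows the shape NewsUrlReader assumes; the constant names, display names, and section URLs are placeholders, not the project's actual values.

// Hypothetical reconstruction of the NewsCategory enum referenced by
// NewsUrlReader and commit 2a695b0 -- not part of this diff.
package com.rollthedice.backend.domain.news.entity;

public enum NewsCategory {

    // Placeholder constants and Naver section URLs for illustration only.
    POLITICS("politics", "https://news.naver.com/section/100"),
    ECONOMY("economy", "https://news.naver.com/section/101"),
    SOCIETY("society", "https://news.naver.com/section/102");

    private final String name;
    private final String categoryUrl;

    NewsCategory(String name, String categoryUrl) {
        this.name = name;
        this.categoryUrl = categoryUrl;
    }

    // Stored in the news.category column via InitNewsDto.newsCategory.
    public String getName() {
        return name;
    }

    // Section listing page crawled by NewsUrlReader.
    public String getCategoryUrl() {
        return categoryUrl;
    }
}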