-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: 가정통신문 크롤링 및 요약 스케줄러 추가 #71
Changes from 17 commits
2d5a45a
be9c52c
dd850cd
7b685fa
f5e771b
3dfe67d
44a347a
64157a1
4cb70b7
dedf7e4
229dc1f
8a887f3
1f26ce8
c861de9
10c52ff
5f15446
248e4fd
e2926e6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
package com.asap.asapbackend.batch.announcement | ||
|
||
import com.asap.asapbackend.domain.announcement.domain.model.EducationOfficeAnnouncement | ||
import com.asap.asapbackend.domain.announcement.domain.model.SchoolAnnouncement | ||
import com.asap.asapbackend.domain.announcement.domain.service.AnnouncementAppender | ||
import com.asap.asapbackend.domain.announcement.domain.service.AnnouncementReader | ||
import com.asap.asapbackend.global.util.ImageToTextConverter | ||
import com.asap.asapbackend.global.util.TextSummaryHelper | ||
import com.asap.asapbackend.global.util.TransactionUtils | ||
import io.github.oshai.kotlinlogging.KotlinLogging | ||
import org.springframework.scheduling.annotation.Scheduled | ||
import org.springframework.stereotype.Component | ||
|
||
private val logger = KotlinLogging.logger {}

/**
 * Scheduled jobs that pull announcements from the crawling providers, run OCR on their images,
 * summarize the extracted text and persist the result through [AnnouncementAppender].
 */
@Component
class AnnouncementScheduler(
    private val schoolAnnouncementInfoProvider: SchoolAnnouncementInfoProvider,
    private val educationOfficeAnnouncementInfoProvider: EducationOfficeAnnouncementInfoProvider,
    private val imageToTextConverter: ImageToTextConverter,
    private val textSummaryHelper: TextSummaryHelper,
    private val announcementAppender: AnnouncementAppender,
    private val announcementReader: AnnouncementReader
) {
    /**
     * Crawls school announcements page by page until the provider reports no further page,
     * storing each page of results in its own writable transaction.
     *
     * The provider returns one entry per attached file, so entries belonging to the same
     * announcement are merged into a single [SchoolAnnouncement].
     */
    // Runs at minute 0 of every hour from 09:00 through 18:00, Monday to Friday.
    @Scheduled(cron = "0 0 9-18 * * MON-FRI")
    fun addAnnouncement() {
        val batchSize = 100
        var pageNumber = 0
        do {
            val announcementDataContainer =
                schoolAnnouncementInfoProvider.retrieveAnnouncementInfo(batchSize, pageNumber)

            pageNumber++

            val announcements = announcementDataContainer.schoolAnnouncementInfo
                // A batch contains entries from several school pages, and an index is only
                // meaningful within one page, so the page is part of the grouping key.
                .groupBy { it.schoolAnnouncementPage to it.index }
                .values
                .map { fileEntries ->
                    val firstEntry = fileEntries.first()
                    // Sort within each file, then keep the files in the order they arrived.
                    val imageUrls = fileEntries.flatMap { it.imageUrls.sorted() }
                    SchoolAnnouncement(
                        schoolAnnouncementPage = firstEntry.schoolAnnouncementPage,
                        index = firstEntry.index,
                        // Use the first non-empty title among the announcement's files.
                        title = fileEntries.firstOrNull { it.title.isNotEmpty() }?.title.orEmpty(),
                        imageUrls = imageUrls,
                        summaries = summarizeImages(imageUrls)
                    )
                }

            TransactionUtils.writable {
                announcementAppender.addSchoolAnnouncements(announcements.toSet())
            }
        } while (announcementDataContainer.hasNext)
    }

    /**
     * Crawls education-office announcements starting from the last stored announcement id,
     * advancing by the batch size until the provider reports no further batch.
     */
    // Same schedule as the school job: hourly, 09:00 through 18:00, Monday to Friday.
    @Scheduled(cron = "0 0 9-18 * * MON-FRI")
    fun addEducationOfficeAnnouncement() {
        // NOTE(review): a reviewer questioned whether 100 is needed here; left unchanged.
        val batchSize = 100
        var startIdx = announcementReader.getLastOfficeEducationAnnouncementId()
        do {
            val announcementDataContainer =
                educationOfficeAnnouncementInfoProvider.retrieveAnnouncementInfo(batchSize, startIdx)

            startIdx += batchSize

            val announcements = announcementDataContainer.educationOfficeAnnouncementInfo
                .groupBy { it.index }
                .values
                .map { fileEntries ->
                    val firstEntry = fileEntries.first()
                    val imageUrls = fileEntries.flatMap { it.imageUrls.sorted() }
                    EducationOfficeAnnouncement(
                        idx = firstEntry.index,
                        title = firstEntry.title,
                        imageUrls = imageUrls,
                        summaries = summarizeImages(imageUrls)
                    )
                }

            TransactionUtils.writable {
                announcementAppender.addEducationOfficeAnnouncements(announcements.toSet())
            }
        } while (announcementDataContainer.hasNext)
    }

    /**
     * Extracts text from the given images and summarizes it.
     *
     * @return the summaries, or an empty list when no text could be extracted.
     */
    private fun summarizeImages(imageUrls: List<String>): List<String> {
        val extractedText = imageToTextConverter.convertImageToText(imageUrls)
        return if (extractedText.isNotEmpty()) {
            textSummaryHelper.summarizeText(extractedText)
        } else {
            emptyList()
        }
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
package com.asap.asapbackend.batch.announcement | ||
|
||
/**
 * Supplies education-office announcement data to the announcement batch job in batches.
 */
interface EducationOfficeAnnouncementInfoProvider {

    /**
     * A single announcement entry.
     *
     * @property index index of the announcement the entry belongs to
     * @property title title of the entry
     * @property imageUrls URLs of the images attached to the entry
     */
    data class EducationOfficeAnnouncementInfo(
        val index: Int,
        val title: String,
        val imageUrls: List<String>
    )

    /**
     * One batch of results.
     *
     * @property educationOfficeAnnouncementInfo entries contained in this batch
     * @property hasNext whether the caller should request another batch
     */
    data class EducationOfficeAnnouncementDataContainer(
        val educationOfficeAnnouncementInfo: List<EducationOfficeAnnouncementInfo>,
        val hasNext: Boolean
    )

    /**
     * Retrieves one batch of announcements.
     *
     * @param batchSize requested size of the batch
     * @param startIndex index from which retrieval starts
     */
    fun retrieveAnnouncementInfo(batchSize: Int, startIndex: Int): EducationOfficeAnnouncementDataContainer
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
package com.asap.asapbackend.batch.announcement | ||
|
||
import com.asap.asapbackend.domain.announcement.domain.model.SchoolAnnouncementPage | ||
|
||
/**
 * Supplies school announcement data to the announcement batch job one page at a time.
 */
interface SchoolAnnouncementInfoProvider {

    /**
     * A single announcement entry.
     *
     * @property schoolAnnouncementPage the school announcement page the entry was retrieved for
     * @property index index of the announcement the entry belongs to
     * @property title title of the entry
     * @property imageUrls URLs of the images attached to the entry
     */
    data class SchoolAnnouncementInfo(
        val schoolAnnouncementPage: SchoolAnnouncementPage,
        val index: Int,
        val title: String,
        val imageUrls: List<String>
    )

    /**
     * One page of results.
     *
     * @property schoolAnnouncementInfo entries contained in this page
     * @property hasNext whether the caller should request another page
     */
    data class SchoolAnnouncementDataContainer(
        val schoolAnnouncementInfo: List<SchoolAnnouncementInfo>,
        val hasNext: Boolean
    )

    /**
     * Retrieves one page of announcements.
     *
     * @param batchSize requested page size
     * @param pageNumber zero-based number of the page to retrieve
     */
    fun retrieveAnnouncementInfo(batchSize: Int, pageNumber: Int): SchoolAnnouncementDataContainer
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
package com.asap.asapbackend.client.crawling.announcement | ||
|
||
import com.asap.asapbackend.batch.announcement.EducationOfficeAnnouncementInfoProvider | ||
import com.asap.asapbackend.client.crawling.announcement.dto.AnnouncementCrawlingResponse | ||
import org.springframework.stereotype.Component | ||
import org.springframework.web.reactive.function.client.WebClient | ||
import java.time.Duration | ||
|
||
/**
 * [EducationOfficeAnnouncementInfoProvider] that fetches announcements from the crawling server.
 */
@Component
class EducationOfficeAnnouncementCrawlingClient : EducationOfficeAnnouncementInfoProvider {

    // Built once and reused; the scheduler calls this client repeatedly in a loop.
    private val webClient = WebClient.create(CRAWLING_SERVER_URL)

    /**
     * Requests up to [batchSize] announcements starting at [startIndex].
     *
     * `hasNext` is derived from the number of announcements in the response rather than the
     * number of converted entries, because conversion emits one entry per attached file.
     * A missing response body is treated as an empty, final batch.
     */
    override fun retrieveAnnouncementInfo(
        batchSize: Int,
        startIndex: Int
    ): EducationOfficeAnnouncementInfoProvider.EducationOfficeAnnouncementDataContainer {
        val response = sendCrawlingRequest(startIndex, batchSize)
        return EducationOfficeAnnouncementInfoProvider.EducationOfficeAnnouncementDataContainer(
            educationOfficeAnnouncementInfo = response?.convertToAnnouncement().orEmpty(),
            // presumably the server returns at most batch_size announcements per call — TODO confirm
            hasNext = response != null && response.data.size == batchSize
        )
    }

    /**
     * Performs the blocking GET against the crawling server.
     *
     * @return the decoded response, or `null` when the server returned no body.
     */
    private fun sendCrawlingRequest(startIdx: Int, batchSize: Int): AnnouncementCrawlingResponse? {
        return webClient
            .get()
            .uri {
                it.queryParam("element_school_url", OFFICE_EDUCATION_URL)
                    .queryParam("start_idx", startIdx)
                    .queryParam("batch_size", batchSize)
                    .build()
            }
            .retrieve()
            .bodyToMono(AnnouncementCrawlingResponse::class.java)
            .timeout(Duration.ofMinutes(20))
            .block()
    }

    companion object {
        private const val OFFICE_EDUCATION_URL = "https://yangwonsoop.sen.es.kr/192786/subMenu.do"
        private const val CRAWLING_SERVER_URL = "http://crawling.ncp.simproject.kr:3000/crawl"
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
package com.asap.asapbackend.client.crawling.announcement | ||
|
||
import com.asap.asapbackend.batch.announcement.SchoolAnnouncementInfoProvider | ||
import com.asap.asapbackend.client.crawling.announcement.dto.AnnouncementCrawlingResponse | ||
import com.asap.asapbackend.domain.announcement.domain.model.SchoolAnnouncementPage | ||
import com.asap.asapbackend.domain.announcement.domain.repository.SchoolAnnouncementPageRepository | ||
import com.asap.asapbackend.domain.announcement.domain.repository.SchoolAnnouncementRepository | ||
import org.springframework.data.domain.PageRequest | ||
import org.springframework.stereotype.Component | ||
import org.springframework.web.reactive.function.client.WebClient | ||
import reactor.core.publisher.Flux | ||
import java.time.Duration | ||
|
||
/**
 * [SchoolAnnouncementInfoProvider] that crawls every stored school announcement page through the
 * crawling server.
 */
@Component
class SchoolAnnouncementCrawlingClient(
    private val schoolAnnouncementPageRepository: SchoolAnnouncementPageRepository,
    private val schoolAnnouncementRepository: SchoolAnnouncementRepository
) : SchoolAnnouncementInfoProvider {

    // Built once and reused for every crawl request.
    private val webClient = WebClient.create(CRAWLING_SERVER_URL)

    /**
     * Loads one page of school announcement pages, crawls each of them and returns all crawled
     * entries together.
     *
     * Blocks until every crawl request has completed; a failure of any request fails the call.
     *
     * @param batchSize used both as the repository page size and as the crawl batch size
     * @param pageNumber zero-based page of school announcement pages to process
     */
    override fun retrieveAnnouncementInfo(
        batchSize: Int,
        pageNumber: Int
    ): SchoolAnnouncementInfoProvider.SchoolAnnouncementDataContainer {
        val announcementPages = schoolAnnouncementPageRepository.findAll(PageRequest.of(pageNumber, batchSize))
        val crawlRequests = announcementPages.map { announcementPage ->
            // NOTE(review): a reviewer asked for startIdx + 1 here; whether the server treats
            // start_idx as inclusive is not visible in this file — confirm before changing.
            val startIdx = schoolAnnouncementRepository.findLastIndex(announcementPage.getSchoolId())
            retrieveAnnouncementInfoFromCrawlingServer(announcementPage, startIdx, batchSize)
        }

        // Let Reactor gather the merged results instead of mutating a list from callbacks.
        val crawledAnnouncements = Flux.merge(crawlRequests)
            .collectList()
            .block()
            .orEmpty()

        return SchoolAnnouncementInfoProvider.SchoolAnnouncementDataContainer(
            schoolAnnouncementInfo = crawledAnnouncements,
            hasNext = announcementPages.hasNext()
        )
    }

    /**
     * Builds the (not yet subscribed) crawl request for a single school announcement page and
     * flattens its response into individual entries.
     */
    private fun retrieveAnnouncementInfoFromCrawlingServer(
        schoolAnnouncementPage: SchoolAnnouncementPage,
        startIdx: Int,
        batchSize: Int
    ): Flux<SchoolAnnouncementInfoProvider.SchoolAnnouncementInfo> {
        return webClient
            .get()
            .uri { uriBuilder ->
                uriBuilder
                    .queryParam("start_idx", startIdx)
                    .queryParam("batch_size", batchSize)
                    .queryParam("element_school_url", schoolAnnouncementPage.schoolAnnouncementPageUrl)
                    .build()
            }
            .retrieve()
            .bodyToMono(AnnouncementCrawlingResponse::class.java)
            .timeout(Duration.ofMinutes(10))
            .map { it.convertToAnnouncement(schoolAnnouncementPage) }
            .flatMapMany { Flux.fromIterable(it) }
    }

    companion object {
        private const val CRAWLING_SERVER_URL = "http://crawling.ncp.simproject.kr:3000/crawl"
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
package com.asap.asapbackend.client.crawling.announcement.dto | ||
|
||
import com.asap.asapbackend.batch.announcement.EducationOfficeAnnouncementInfoProvider | ||
import com.asap.asapbackend.batch.announcement.SchoolAnnouncementInfoProvider | ||
import com.asap.asapbackend.domain.announcement.domain.model.SchoolAnnouncementPage | ||
|
||
/**
 * Response body of the crawling server: a list of announcements, each with its attached files.
 */
data class AnnouncementCrawlingResponse(
    val data: List<AnnouncementDetail>
) {
    /**
     * Flattens the response into one school announcement entry per attached file, tagging every
     * entry with [schoolAnnouncementPage].
     *
     * @throws NumberFormatException if an announcement that has files carries a non-numeric `idx`
     */
    fun convertToAnnouncement(schoolAnnouncementPage: SchoolAnnouncementPage): List<SchoolAnnouncementInfoProvider.SchoolAnnouncementInfo> {
        val entries = mutableListOf<SchoolAnnouncementInfoProvider.SchoolAnnouncementInfo>()
        for (announcement in data) {
            for (file in announcement.file_info) {
                entries += SchoolAnnouncementInfoProvider.SchoolAnnouncementInfo(
                    schoolAnnouncementPage = schoolAnnouncementPage,
                    index = announcement.idx.toInt(),
                    title = file.title,
                    imageUrls = file.image_url
                )
            }
        }
        return entries
    }

    /**
     * Flattens the response into one education-office announcement entry per attached file.
     *
     * @throws NumberFormatException if an announcement that has files carries a non-numeric `idx`
     */
    fun convertToAnnouncement(): List<EducationOfficeAnnouncementInfoProvider.EducationOfficeAnnouncementInfo> {
        val entries = mutableListOf<EducationOfficeAnnouncementInfoProvider.EducationOfficeAnnouncementInfo>()
        for (announcement in data) {
            for (file in announcement.file_info) {
                entries += EducationOfficeAnnouncementInfoProvider.EducationOfficeAnnouncementInfo(
                    index = announcement.idx.toInt(),
                    title = file.title,
                    imageUrls = file.image_url
                )
            }
        }
        return entries
    }
}
|
||
/**
 * One crawled announcement: its index as text (`idx`) and the files attached to it (`file_info`).
 */
data class AnnouncementDetail(val idx: String, val file_info: List<AnnouncementFileInfo>)
|
||
/**
 * One file attached to a crawled announcement: its title and the URLs of its images (`image_url`).
 */
data class AnnouncementFileInfo(val title: String, val image_url: List<String>)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
package com.asap.asapbackend.client.ncp | ||
|
||
import org.springframework.boot.context.properties.ConfigurationProperties | ||
|
||
/**
 * Configuration bound from the `key.ncp` prefix.
 *
 * @property ocrKey value sent as the `X-OCR-SECRET` header by the OCR client
 */
@ConfigurationProperties(prefix = "key.ncp")
data class NcpApiProperties(val ocrKey: String)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
package com.asap.asapbackend.client.ncp.ocr | ||
|
||
import com.asap.asapbackend.client.ncp.NcpApiProperties | ||
import com.asap.asapbackend.client.ncp.ocr.dto.NcpOcrResponse | ||
import com.asap.asapbackend.global.util.ImageToTextConverter | ||
import org.springframework.stereotype.Component | ||
import org.springframework.web.reactive.function.client.WebClient | ||
import reactor.core.publisher.Flux | ||
import reactor.core.publisher.Mono | ||
import java.util.* | ||
|
||
/**
 * [ImageToTextConverter] backed by the NCP OCR HTTP API.
 */
@Component
class NcpImageToTextConverter(
    private val ncpApiProperties: NcpApiProperties
) : ImageToTextConverter {

    // Built once and reused; previously a new client was created for every image.
    private val webClient = WebClient.create(NCP_OCR_API_URL)

    /**
     * Runs OCR on every image and returns the recognized text of all images, in the order of
     * [imageUrls], joined by newlines. Images that yield no text are skipped.
     *
     * Requests are issued concurrently and the call blocks until all of them have completed.
     *
     * @return the combined text, or an empty string when there are no images or no text
     * @throws RuntimeException if the OCR API answers any request with a 4xx or 5xx status
     */
    override fun convertImageToText(imageUrls: List<String>): String {
        val textPerImage = Flux.fromIterable(imageUrls)
            // flatMapSequential keeps the input order; collectList keeps every image's text
            // rather than only the text of whichever request finished last.
            .flatMapSequential { imageUrl ->
                sendRequestToNcpOcrApi(imageUrl)
                    .map { response -> response?.getInferText() ?: "" }
                    .defaultIfEmpty("")
            }
            .collectList()
            .block()
            .orEmpty()

        return textPerImage
            .filter { it.isNotEmpty() }
            .joinToString(separator = "\n")
    }

    /**
     * Builds the OCR request for a single image URL. Nothing is sent until the returned [Mono]
     * is subscribed.
     */
    private fun sendRequestToNcpOcrApi(imageUrl: String): Mono<NcpOcrResponse?> {
        return webClient.post()
            .header("X-OCR-SECRET", ncpApiProperties.ocrKey)
            .header("Content-Type", "application/json")
            .bodyValue(
                mapOf(
                    "version" to "v2",
                    "requestId" to UUID.randomUUID(),
                    "timestamp" to 0,
                    "lang" to "ko",
                    "images" to listOf(
                        mapOf(
                            // NOTE(review): format is fixed to "png" regardless of the actual
                            // image type behind the URL — confirm the API tolerates this.
                            "format" to "png",
                            "name" to "string",
                            "url" to imageUrl
                        )
                    )
                )
            )
            .retrieve()
            .onStatus({ status -> status.is4xxClientError || status.is5xxServerError }) {
                // Surface the error body so failures are diagnosable from the exception message.
                it.bodyToMono(String::class.java)
                    .map { body -> RuntimeException("NCP OCR API Error: $body") }
            }
            .bodyToMono(NcpOcrResponse::class.java)
    }

    companion object {
        // NOTE(review): the invoke URL is hard-coded in source; consider moving it next to
        // ocrKey in NcpApiProperties.
        private const val NCP_OCR_API_URL =
            "https://79vlzuxe20.apigw.ntruss.com/custom/v1/31007/86118a3b7fc92492b026562882076481e99119cc9563879b4f0076cf8d7bff31/general"
    }
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
안쓰면 지울까.?