Skip to content

Commit

Permalink
Save raw content of posts in db (#215)
Browse files: browse the repository at this point in the history
* Add column in post table for raw post content

* Add raw content param to `PostPayload`

* Save raw content when parsing post contents
  • Loading branch information
msasikanth authored Jan 10, 2024
1 parent c12fbb4 commit e93aadc
Show file tree
Hide file tree
Showing 8 changed files with 49 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ data class PostPayload(
val title: String,
val link: String,
val description: String,
val rawContent: String?,
val imageUrl: String?,
val date: Long,
val commentsLink: String?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ internal object AtomContentParser : ContentParser() {
var title: String? = null
var link: String? = null
var content: String? = null
var rawContent: String? = null
var date: String? = null
var image: String? = null

Expand All @@ -113,7 +114,7 @@ internal object AtomContentParser : ContentParser() {
}
}
TAG_CONTENT -> {
val rawContent = readTagText(tagName, parser)
rawContent = readTagText(tagName, parser).trimIndent()
KsoupHtmlParser(
handler =
HtmlContentParser {
Expand Down Expand Up @@ -144,8 +145,9 @@ internal object AtomContentParser : ContentParser() {

return PostPayload(
title = FeedParser.cleanText(title, decodeUrlEncoding = true).orEmpty(),
description = FeedParser.cleanTextCompact(content, decodeUrlEncoding = true).orEmpty(),
link = FeedParser.cleanText(link)!!,
description = FeedParser.cleanTextCompact(content, decodeUrlEncoding = true).orEmpty(),
rawContent = rawContent,
imageUrl = FeedParser.safeUrl(hostLink, image),
date = postPubDateInMillis ?: Clock.System.now().toEpochMilliseconds(),
commentsLink = null
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ internal object RssContentParser : ContentParser() {
var title: String? = null
var link: String? = null
var description: String? = null
var rawContent: String? = null
var date: String? = null
var image: String? = null
var commentsLink: String? = null
Expand All @@ -114,6 +115,7 @@ internal object RssContentParser : ContentParser() {
}
name == TAG_DESCRIPTION || name == TAG_CONTENT_ENCODED -> {
description = readTagText(name, parser)
rawContent = description.trimIndent()
}
name == TAG_PUB_DATE -> {
date = readTagText(name, parser)
Expand Down Expand Up @@ -149,8 +151,9 @@ internal object RssContentParser : ContentParser() {

return PostPayload(
title = FeedParser.cleanText(title, decodeUrlEncoding = true).orEmpty(),
description = FeedParser.cleanTextCompact(description, decodeUrlEncoding = true).orEmpty(),
link = FeedParser.cleanText(link)!!,
description = FeedParser.cleanTextCompact(description, decodeUrlEncoding = true).orEmpty(),
rawContent = rawContent,
imageUrl = FeedParser.safeUrl(hostLink, image),
date = postPubDateInMillis ?: Clock.System.now().toEpochMilliseconds(),
commentsLink = commentsLink?.trim()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ class RssRepository(
link = post.link,
commnetsLink = post.commentsLink,
feedLink = feedPayload.link,
rawContent = post.rawContent
)
}
}
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,17 @@ CREATE TABLE post(
bookmarked INTEGER AS Boolean NOT NULL DEFAULT 0,
commentsLink TEXT DEFAULT NULL,
read INTEGER AS Boolean NOT NULL DEFAULT 0,
rawContent TEXT,
FOREIGN KEY(feedLink) REFERENCES feed(link) ON DELETE CASCADE
);

CREATE INDEX post_feed_link_index ON post(feedLink);

upsert:
INSERT INTO post(title, description, imageUrl, date, feedLink, link, commentsLink)
VALUES (:title, :description, :imageUrl, :date, :feedLink, :link, :commnetsLink)
INSERT INTO post(title, description, rawContent, imageUrl, date, feedLink, link, commentsLink)
VALUES (:title, :description, :rawContent, :imageUrl, :date, :feedLink, :link, :commnetsLink)
ON CONFLICT(link) DO
UPDATE SET title = excluded.title, description = excluded.description, imageUrl = excluded.imageUrl, date = excluded.date;
UPDATE SET title = excluded.title, description = excluded.description, rawContent = excluded.rawContent, imageUrl = excluded.imageUrl, date = excluded.date;

count:
SELECT COUNT(*) FROM post
Expand Down
1 change: 1 addition & 0 deletions shared/src/commonMain/sqldelight/migrations/8.sqm
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-- Migration 8: adds the `rawContent` TEXT column to the `post` table so the
-- raw content of a post can be stored alongside its cleaned description.
-- The column is declared without NOT NULL or DEFAULT, so it accepts NULL.
ALTER TABLE post ADD COLUMN rawContent TEXT;
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class FeedParserTest {
title = "Post with image",
link = "https://example.com/first-post",
description = "First post description.",
rawContent = "First post description.",
imageUrl = "https://example.com/first-post-media-url",
date = 1685005200000,
commentsLink = null
Expand All @@ -50,6 +51,7 @@ class FeedParserTest {
title = "Post without image",
link = "https://example.com/second-post",
description = "Second post description.",
rawContent = "Second post description.",
imageUrl = null,
date = 1684999800000,
commentsLink = null
Expand All @@ -58,6 +60,7 @@ class FeedParserTest {
title = "Podcast post",
link = "https://example.com/third-post",
description = "Third post description.",
rawContent = "Third post description.",
imageUrl = null,
date = 1684924200000,
commentsLink = null
Expand All @@ -66,6 +69,7 @@ class FeedParserTest {
title = "Post with enclosure image",
link = "https://example.com/fourth-post",
description = "Fourth post description.",
rawContent = "Fourth post description.",
imageUrl = "https://example.com/enclosure-image",
date = 1684924200000,
commentsLink = null
Expand All @@ -74,6 +78,12 @@ class FeedParserTest {
title = "Post with description and encoded content",
link = "https://example.com/fifth-post",
description = "Fourth post description in HTML syntax.",
rawContent =
"""
<p>Fourth post description in HTML syntax.</p>
<img src="https://example.com/encoded-image" alt="encoded image" />
"""
.trimIndent(),
imageUrl = "https://example.com/encoded-image",
date = 1684924200000,
commentsLink = null
Expand All @@ -82,6 +92,7 @@ class FeedParserTest {
title = "Post with relative path image",
link = "https://example.com/post-with-relative-image",
description = "Relative image post description.",
rawContent = "Relative image post description.",
imageUrl = "https://example.com/relative-media-url",
date = 1685005200000,
commentsLink = null
Expand All @@ -90,6 +101,7 @@ class FeedParserTest {
title = "Post with comments",
link = "https://example.com/post-with-comments",
description = "Really long post with comments.",
rawContent = "Really long post with comments.",
imageUrl = null,
date = 1685005200000,
commentsLink = "https://example/post-with-comments/comments"
Expand Down Expand Up @@ -120,6 +132,12 @@ class FeedParserTest {
title = "Post with image",
link = "https://example.com/first-post",
description = "Post summary with an image.",
rawContent =
"""
<img alt="First Image" src="https://example.com/image.jpg" />
<p>Post summary with an image.</p>
"""
.trimIndent(),
imageUrl = "https://example.com/image.jpg",
date = 1685008800000,
commentsLink = null
Expand All @@ -128,6 +146,11 @@ class FeedParserTest {
title = "Second post",
link = "https://example.com/second-post",
description = "Post summary of the second post.",
rawContent =
"""
<p>Post summary of the second post.</p>
"""
.trimIndent(),
imageUrl = null,
date = 1684917000000,
commentsLink = null
Expand All @@ -136,6 +159,11 @@ class FeedParserTest {
title = "Post without image",
link = "https://example.com/third-post",
description = "Post summary of the third post. click here.",
rawContent =
"""
<p>Post summary of the third post. <a href="https://example.com/hyperlink" >click here</a>.</p>
"""
.trimIndent(),
imageUrl = null,
date = 1684936800000,
commentsLink = null
Expand All @@ -144,6 +172,12 @@ class FeedParserTest {
title = "Post with relative image",
link = "https://example.com/relative-image-post",
description = "Post summary with an image.",
rawContent =
"""
<img alt="Relative Image" src="/resources/image.jpg" />
<p>Post summary with an image.</p>
"""
.trimIndent(),
imageUrl = "https://example.com/resources/image.jpg",
date = 1685008800000,
commentsLink = null
Expand Down

0 comments on commit e93aadc

Please sign in to comment.