fetch.py

#!/usr/bin/python3
# SPDX-License-Identifier: MIT
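#
# Fetch every feed listed in the feeds table of feeds.db and store up to
# MAX_ENTRIES_PER_FEED posts per feed in the posts table. Pass --force to
# refetch a feed even when its ETag/Last-Modified say it is unchanged.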
from datetime import datetime
import feedparser
import re
import sqlite3
import sys
import time
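
# feeds.db is assumed to exist already; the shape below is a sketch
# inferred from the queries in this script, not a definitive schema:
#   feeds(id, name, url, title, blog_url, etag, modified)
#   posts(feed_id, title, post, url, author, published_date)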
MAX_ENTRIES_PER_FEED = 8

force = False
if len(sys.argv) > 1 and sys.argv[1] == '--force':
    force = True

conn = sqlite3.connect('feeds.db')
conn.row_factory = sqlite3.Row
c = conn.cursor()
c.execute('SELECT * FROM feeds')
# Blogger reports post authors as "noreply@blogger.com (Author Name)";
# this pattern lets us keep just the name
author_regexp = re.compile(r'noreply@blogger\.com \((.*)\)$')
for feed in c.fetchall():
    print(feed['name'])
    etag = feed['etag']
    modified = feed['modified']
    if force:
        etag = 0
        modified = 0
    url = feed['url']
    try:
        # etag and modified record when we last checked, so the server can
        # return only posts that are newer (or 304 if nothing changed)
        data = feedparser.parse(url, etag=etag, modified=modified)
        if data.bozo and 'link' not in data.feed:
            # bozo flags a malformed feed; give up on it unless feedparser
            # still recovered something usable
            print(data.bozo)
            print(data.bozo_exception)
            print(data.feed)
            continue
        if 'etag' not in data:
            data.etag = 0
        if 'modified' not in data:
            data.modified = 0
        if 'status' not in data:
            data.status = 200
        # on 304 (not modified) no data was fetched, so fall back to the
        # stored title and link
        if 'title' in data.feed:
            title = data.feed.title
        else:
            title = feed['title']
        if 'link' in data.feed:
            link = data.feed.link
        else:
            link = feed['blog_url']
        print(f'''
ETag: {data.etag}
Modified: {data.modified}
Status: {data.status}
Posts: {len(data.entries)}
''')
        if data.status == 301:  # permanent redirect, remember the new URL
            url = data.href
        c.execute('''UPDATE feeds
                     SET etag=?, modified=?, url=?, title=?, blog_url=?
                     WHERE id=?''',
                  (data.etag, data.modified, url, title, link, feed['id'],))
        # a 200 means the full feed was fetched, so drop the stored posts
        # and re-insert them below
        if data.status == 200:
            c.execute('DELETE FROM posts WHERE feed_id=?', (feed['id'],))
        counter = 0
        for post in data.entries:
            if 'content' in post:
                content = post.content[0].value
            elif 'summary' in post:
                content = post.summary
            else:
                # neither full content nor a summary was provided; store an
                # empty body instead of reusing the previous post's content
                content = ''
            if 'author' not in post:
                author = feed['name']
            else:
                author = post.author
            # strip Blogger's "noreply@blogger.com (Name)" wrapping
            author = author_regexp.sub(r'\1', author)
            try:
                published = datetime.fromtimestamp(
                    time.mktime(post.published_parsed[:8] + (-1,)))
            except AttributeError:
                # some feeds only carry an updated date
                published = datetime.fromtimestamp(
                    time.mktime(post.updated_parsed[:8] + (-1,)))
            try:
                c.execute('''INSERT INTO posts (feed_id, title, post, url,
                             author, published_date) VALUES (?, ?, ?, ?, ?, ?)''',
                          (feed['id'], post.title, content, post.link, author,
                           published))
            except ValueError:
                pass
            counter += 1
            if counter == MAX_ENTRIES_PER_FEED:
                break
    except RuntimeError:
        print('failed')
    print(' ')
conn.commit()
conn.close()