Adding github_trending spider to scrapy app
beshiniii committed Apr 2, 2021
1 parent 5408c22 commit d18a068
Showing 4 changed files with 122 additions and 80 deletions.
2 changes: 1 addition & 1 deletion crawlerx_app/src/views/Dashboard.vue
@@ -241,7 +241,7 @@
isOnMobile: false,
projectOptions: [],
selectedProject: null,
crawlerOptions: ["crawlerx", "stackoverflow", "wikipedia", "reddit"]
crawlerOptions: ["crawlerx", "stackoverflow", "wikipedia", "reddit", "github_trending"]
}
},
mounted() {
84 changes: 84 additions & 0 deletions scrapy_app/scrapy_app/spider_common.py
@@ -0,0 +1,84 @@
import re
from scrapy.selector import Selector

# Shared CSS-rule extraction helpers used by the scrapy_app spiders.
# These were methods on RedditSpider before this commit; as module-level
# functions they no longer take `self`. AUTO_JOIN_TEXT and KEYWORDS stand in
# for the old spider attributes and are assumed defaults, not values taken
# from the original spider.
AUTO_JOIN_TEXT = False
KEYWORDS = ('__use', '__list')


def extract_item(sels):
    # Collapse runs of whitespace and drop extractions that are only whitespace.
    contents = []
    for i in sels:
        content = re.sub(r'\s+', ' ', i.extract())
        if content != ' ':
            contents.append(content)
    return contents


def extract_items(sel, rules, item):
    for nk, nv in rules.items():
        if nk in ('__use', '__list'):
            continue
        if nk not in item:
            item[nk] = []
        if sel.css(nv):
            item[nk] += extract_item(sel.css(nv))
        else:
            item[nk] = []


def traversal(sel, rules, item_class, item, items):
    # Recursive walk for Item-based rule trees; '__use'/'__list' are control markers.
    if item is None:
        item = item_class()
    if '__use' in rules:
        if '__list' in rules:
            unique_item = item_class()
            extract_items(sel, rules, unique_item)
            items.append(unique_item)
        else:
            extract_items(sel, rules, item)
    else:
        for nk, nv in rules.items():
            for i in sel.css(nk):
                traversal(i, nv, item_class, item, items)


def traversal_dict(sel, rules, item_class, item, items, force_1_item):
    # Recursive walk for plain-dict rule trees: nested dicts recurse on their
    # outer selector, string values are CSS selectors handled by deal_text.
    item = {}
    for k, v in rules.items():
        if not isinstance(v, dict):
            if k in KEYWORDS:
                continue
            if isinstance(v, list):
                continue
            deal_text(sel, item, force_1_item, k, v)
        else:
            item[k] = []
            for i in sel.css(k):
                traversal_dict(i, v, item_class, item, item[k], force_1_item)
    items.append(item)


def deal_text(sel, item, force_1_item, k, v):
    if v.endswith('::text') and AUTO_JOIN_TEXT:
        item[k] = ' '.join(extract_item(sel.css(v)))
    else:
        _items = extract_item(sel.css(v))
        if force_1_item:
            item[k] = _items[0] if len(_items) >= 1 else ''
        else:
            item[k] = _items


def depth_first_search(sel, rules, item_class, force_1_item):
    if sel is None:
        return []
    items = []
    if item_class != dict:
        traversal(sel, rules, item_class, None, items)
    else:
        traversal_dict(sel, rules, item_class, None, items, force_1_item)
    return items


def parse_with_rules(response, rules, item_class, force_1_item=False):
    return depth_first_search(Selector(response), rules, item_class, force_1_item)
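Taken together, these helpers form a small rule-driven extractor: parse_with_rules wraps the response in a Selector and walks a nested dict of CSS rules, producing one dict per match of each outer selector. A minimal, self-contained sketch of how it behaves (the HTML snippet, URL and values are invented for illustration and are not part of the commit):

# sketch_spider_common_usage.py -- illustrative only
from scrapy.http import HtmlResponse
from scrapy_app.spider_common import parse_with_rules

html = b'''
<ul>
  <li class="repo-list-item">
    <span class="repo-list-name"><a href="/alice/demo">alice/demo</a></span>
    <span class="repo-list-meta">Python - 120 stars</span>
  </li>
</ul>
'''
response = HtmlResponse(url='https://example.org/trending', body=html, encoding='utf-8')

rules = {
    '.repo-list-item': {
        'repo_name': '.repo-list-name a::attr(href)',
        'repo_meta': '.repo-list-meta::text',
    }
}

# Prints roughly:
# [{'.repo-list-item': [{'repo_name': ['/alice/demo'],
#                        'repo_meta': ['Python - 120 stars']}]}]
print(parse_with_rules(response, rules, dict))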
35 changes: 35 additions & 0 deletions scrapy_app/scrapy_app/spiders/github_trending.py
@@ -0,0 +1,35 @@
import os
from scrapy.spiders import CrawlSpider
from scrapy_app.spider_common import *


class GithubTrendingSpider(CrawlSpider):
    name = "github_trending"

    # Nested CSS rules consumed by parse_with_rules: the outer selector picks
    # each repository card, the inner selectors pull fields out of it.
    list_css_rules = {
        '.repo-list-item': {
            'repo_name': '.repo-list-name a::attr(href)',
            'repo_meta': '.repo-list-meta::text',
        }
    }

    def __init__(self, *args, **kwargs):
        # Target URL, allowed domain and job settings arrive as spider
        # arguments when the crawl is scheduled.
        self.url = kwargs.get('url')
        self.domain = kwargs.get('domain')
        self.start_urls = [self.url]
        self.allowed_domains = [self.domain]
        self.settings = kwargs.get('settings')

        super(GithubTrendingSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        parsed_item = dict()
        parsed_settings = dict(self.settings)
        parsed_item['user_id'] = parsed_settings['user_id']
        parsed_item['project_name'] = parsed_settings['project_name']
        parsed_item['job_name'] = parsed_settings['job_name']
        parsed_item['unique_id'] = parsed_settings['unique_id']
        # SCRAPY_JOB is set in the environment by Scrapyd for each scheduled job.
        parsed_item['task_id'] = os.environ['SCRAPY_JOB']
        crawled_data = parse_with_rules(response, self.list_css_rules, dict)
        parsed_item['data'] = crawled_data
        yield parsed_item
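parse() pulls the per-job identifiers out of self.settings and the job id out of the SCRAPY_JOB environment variable, which Scrapyd sets for every job it runs. A hypothetical scheduling call, assuming the app drives its spiders through Scrapyd's schedule.json endpoint and passes the identifiers as Scrapy settings; the host, project name and all values below are placeholders, not taken from this repository:

# sketch_schedule_github_trending.py -- illustrative only
import requests

resp = requests.post(
    'http://localhost:6800/schedule.json',
    data=[
        ('project', 'scrapy_app'),               # deployed Scrapyd project (assumed name)
        ('spider', 'github_trending'),
        ('url', 'https://github.com/trending'),  # becomes the spider's `url` kwarg
        ('domain', 'github.com'),                # becomes the spider's `domain` kwarg
        ('setting', 'user_id=1'),                # read back in parse() via self.settings
        ('setting', 'project_name=demo'),
        ('setting', 'job_name=github-trending'),
        ('setting', 'unique_id=abc123'),
    ],
)
print(resp.json())  # e.g. {'status': 'ok', 'jobid': '...'}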
81 changes: 2 additions & 79 deletions scrapy_app/scrapy_app/spiders/reddit.py
@@ -1,8 +1,7 @@
-import re
import os
-from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider
from scrapy.item import Item, Field
+from scrapy_app.spider_common import *


class RedditItem(Item):
@@ -37,82 +36,6 @@ def __init__(self, *args, **kwargs):

        super(RedditSpider, self).__init__(*args, **kwargs)

    @staticmethod
    def extract_item(sels):
        contents = []
        for i in sels:
            content = re.sub(r'\s+', ' ', i.extract())
            if content != ' ':
                contents.append(content)
        return contents

    def extract_items(self, sel, rules, item):
        for nk, nv in rules.items():
            if nk in ('__use', '__list'):
                continue
            if nk not in item:
                item[nk] = []
            if sel.css(nv):
                item[nk] += self.extract_item(sel.css(nv))
            else:
                item[nk] = []

    def traversal(self, sel, rules, item_class, item, items):
        if item is None:
            item = item_class()
        if '__use' in rules:
            if '__list' in rules:
                unique_item = item_class()
                self.extract_items(sel, rules, unique_item)
                items.append(unique_item)
            else:
                self.extract_items(sel, rules, item)
        else:
            for nk, nv in rules.items():
                for i in sel.css(nk):
                    self.traversal(i, nv, item_class, item, items)

    def traversal_dict(self, sel, rules, item_class, item, items, force_1_item):
        item = {}
        for k, v in rules.items():
            if type(v) != dict:
                if k in self.keywords:
                    continue
                if type(v) == list:
                    continue
                self.deal_text(sel, item, force_1_item, k, v)
            else:
                item[k] = []
                for i in sel.css(k):
                    self.traversal_dict(i, v, item_class, item, item[k], force_1_item)
        items.append(item)

    def deal_text(self, sel, item, force_1_item, k, v):
        if v.endswith('::text') and self.auto_join_text:
            item[k] = ' '.join(self.extract_item(sel.css(v)))
        else:
            _items = self.extract_item(sel.css(v))
            if force_1_item:
                if len(_items) >= 1:
                    item[k] = _items[0]
                else:
                    item[k] = ''
            else:
                item[k] = _items

    def depth_first_search(self, sel, rules, item_class, force_1_item):
        if sel is None:
            return []
        items = []
        if item_class != dict:
            self.traversal(sel, rules, item_class, None, items)
        else:
            self.traversal_dict(sel, rules, item_class, None, items, force_1_item)
        return items

    def parse_with_rules(self, response, rules, item_class, force_1_item=False):
        return self.depth_first_search(Selector(response), rules, item_class, force_1_item)

    def parse(self, response):
        parsed_item = dict()
        parsed_settings = dict(self.settings)
@@ -121,6 +44,6 @@ def parse(self, response):
        parsed_item['job_name'] = parsed_settings['job_name']
        parsed_item['unique_id'] = parsed_settings['unique_id']
        parsed_item['task_id'] = os.environ['SCRAPY_JOB']
-       crawled_data = self.parse_with_rules(response, self.list_css_rules, dict)
+       crawled_data = parse_with_rules(response, self.list_css_rules, dict)
        parsed_item['data'] = crawled_data
        yield parsed_item
