From d18a068094cc12e13963529045c12afc78e58a8b Mon Sep 17 00:00:00 2001
From: beshiniii
Date: Fri, 2 Apr 2021 20:39:16 +0530
Subject: [PATCH] Adding github_trending spider to scrapy app

---
 crawlerx_app/src/views/Dashboard.vue          |  2 +-
 scrapy_app/scrapy_app/spider_common.py        | 84 +++++++++++++++++++
 .../scrapy_app/spiders/github_trending.py     | 35 ++++++++
 scrapy_app/scrapy_app/spiders/reddit.py       | 81 +-----------------
 4 files changed, 122 insertions(+), 80 deletions(-)
 create mode 100644 scrapy_app/scrapy_app/spider_common.py
 create mode 100644 scrapy_app/scrapy_app/spiders/github_trending.py

diff --git a/crawlerx_app/src/views/Dashboard.vue b/crawlerx_app/src/views/Dashboard.vue
index 8de0cdc..1e5ebb6 100644
--- a/crawlerx_app/src/views/Dashboard.vue
+++ b/crawlerx_app/src/views/Dashboard.vue
@@ -241,7 +241,7 @@
       isOnMobile: false,
       projectOptions: [],
       selectedProject: null,
-      crawlerOptions: ["crawlerx", "stackoverflow", "wikipedia", "reddit"]
+      crawlerOptions: ["crawlerx", "stackoverflow", "wikipedia", "reddit", "github_trending"]
     }
   },
   mounted() {
diff --git a/scrapy_app/scrapy_app/spider_common.py b/scrapy_app/scrapy_app/spider_common.py
new file mode 100644
index 0000000..3982948
--- /dev/null
+++ b/scrapy_app/scrapy_app/spider_common.py
@@ -0,0 +1,84 @@
+import re
+
+from scrapy.selector import Selector
+
+# Rule keys that control traversal and are never emitted as output fields.
+CONTROL_KEYS = ('__use', '__list')
+
+
+def extract_item(sels):
+    # Normalise whitespace in each selector's text and drop blank results.
+    contents = []
+    for i in sels:
+        content = re.sub(r'\s+', ' ', i.extract())
+        if content != ' ':
+            contents.append(content)
+    return contents
+
+
+def extract_items(sel, rules, item):
+    for nk, nv in rules.items():
+        if nk in CONTROL_KEYS:
+            continue
+        if nk not in item:
+            item[nk] = []
+        if sel.css(nv):
+            item[nk] += extract_item(sel.css(nv))
+        else:
+            item[nk] = []
+
+
+def traversal(sel, rules, item_class, item, items):
+    if item is None:
+        item = item_class()
+    if '__use' in rules:
+        if '__list' in rules:
+            unique_item = item_class()
+            extract_items(sel, rules, unique_item)
+            items.append(unique_item)
+        else:
+            extract_items(sel, rules, item)
+    else:
+        for nk, nv in rules.items():
+            for i in sel.css(nk):
+                traversal(i, nv, item_class, item, items)
+
+
+def traversal_dict(sel, rules, item_class, item, items, force_1_item, auto_join_text=False):
+    # Walk a nested dict of CSS rules, emitting one dict per rule tree.
+    item = {}
+    for k, v in rules.items():
+        if not isinstance(v, dict):
+            if k in CONTROL_KEYS or isinstance(v, list):
+                continue
+            deal_text(sel, item, force_1_item, k, v, auto_join_text)
+        else:
+            item[k] = []
+            for i in sel.css(k):
+                traversal_dict(i, v, item_class, item, item[k], force_1_item, auto_join_text)
+    items.append(item)
+
+
+def deal_text(sel, item, force_1_item, k, v, auto_join_text=False):
+    if v.endswith('::text') and auto_join_text:
+        item[k] = ' '.join(extract_item(sel.css(v)))
+    else:
+        _items = extract_item(sel.css(v))
+        if force_1_item:
+            item[k] = _items[0] if _items else ''
+        else:
+            item[k] = _items
+
+
+def depth_first_search(sel, rules, item_class, force_1_item):
+    if sel is None:
+        return []
+    items = []
+    if item_class != dict:
+        traversal(sel, rules, item_class, None, items)
+    else:
+        traversal_dict(sel, rules, item_class, None, items, force_1_item)
+    return items
+
+
+def parse_with_rules(response, rules, item_class, force_1_item=False):
+    return depth_first_search(Selector(response), rules, item_class, force_1_item)
diff --git a/scrapy_app/scrapy_app/spiders/github_trending.py b/scrapy_app/scrapy_app/spiders/github_trending.py
new file mode 100644
index 0000000..91eb7de
--- /dev/null
+++ b/scrapy_app/scrapy_app/spiders/github_trending.py
@@ -0,0 +1,35 @@
+import os
+from scrapy.spiders import CrawlSpider
+from scrapy_app.spider_common import *
+
+
+class GithubTrendingSpider(CrawlSpider):
+    name = "github_trending"
+
+    list_css_rules = {
+        '.repo-list-item': {
+            'repo_name': '.repo-list-name a::attr(href)',
+            'repo_meta': '.repo-list-meta::text',
+        }
+    }
+
+    def __init__(self, *args, **kwargs):
+        self.url = kwargs.get('url')
+        self.domain = kwargs.get('domain')
+        self.start_urls = [self.url]
+        self.allowed_domains = [self.domain]
+        self.settings = kwargs.get('settings')
+
+        super(GithubTrendingSpider, self).__init__(*args, **kwargs)
+
+    def parse(self, response):
+        parsed_item = dict()
+        parsed_settings = dict(self.settings)
+        parsed_item['user_id'] = parsed_settings['user_id']
+        parsed_item['project_name'] = parsed_settings['project_name']
+        parsed_item['job_name'] = parsed_settings['job_name']
+        parsed_item['unique_id'] = parsed_settings['unique_id']
+        parsed_item['task_id'] = os.environ['SCRAPY_JOB']
+        crawled_data = parse_with_rules(response, self.list_css_rules, dict)
+        parsed_item['data'] = crawled_data
+        yield parsed_item
diff --git a/scrapy_app/scrapy_app/spiders/reddit.py b/scrapy_app/scrapy_app/spiders/reddit.py
index a8b270b..1426832 100644
--- a/scrapy_app/scrapy_app/spiders/reddit.py
+++ b/scrapy_app/scrapy_app/spiders/reddit.py
@@ -1,8 +1,7 @@
-import re
 import os
-from scrapy.selector import Selector
 from scrapy.spiders import CrawlSpider
 from scrapy.item import Item, Field
+from scrapy_app.spider_common import *
 
 
 class RedditItem(Item):
@@ -37,82 +36,6 @@ def __init__(self, *args, **kwargs):
 
         super(RedditSpider, self).__init__(*args, **kwargs)
 
-    @staticmethod
-    def extract_item(sels):
-        contents = []
-        for i in sels:
-            content = re.sub(r'\s+', ' ', i.extract())
-            if content != ' ':
-                contents.append(content)
-        return contents
-
-    def extract_items(self, sel, rules, item):
-        for nk, nv in rules.items():
-            if nk in ('__use', '__list'):
-                continue
-            if nk not in item:
-                item[nk] = []
-            if sel.css(nv):
-                item[nk] += self.extract_item(sel.css(nv))
-            else:
-                item[nk] = []
-
-    def traversal(self, sel, rules, item_class, item, items):
-        if item is None:
-            item = item_class()
-        if '__use' in rules:
-            if '__list' in rules:
-                unique_item = item_class()
-                self.extract_items(sel, rules, unique_item)
-                items.append(unique_item)
-            else:
-                self.extract_items(sel, rules, item)
-        else:
-            for nk, nv in rules.items():
-                for i in sel.css(nk):
-                    self.traversal(i, nv, item_class, item, items)
-
-    def traversal_dict(self, sel, rules, item_class, item, items, force_1_item):
-        item = {}
-        for k, v in rules.items():
-            if type(v) != dict:
-                if k in self.keywords:
-                    continue
-                if type(v) == list:
-                    continue
-                self.deal_text(sel, item, force_1_item, k, v)
-            else:
-                item[k] = []
-                for i in sel.css(k):
-                    self.traversal_dict(i, v, item_class, item, item[k], force_1_item)
-        items.append(item)
-
-    def deal_text(self, sel, item, force_1_item, k, v):
-        if v.endswith('::text') and self.auto_join_text:
-            item[k] = ' '.join(self.extract_item(sel.css(v)))
-        else:
-            _items = self.extract_item(sel.css(v))
-            if force_1_item:
-                if len(_items) >= 1:
-                    item[k] = _items[0]
-                else:
-                    item[k] = ''
-            else:
-                item[k] = _items
-
-    def depth_first_search(self, sel, rules, item_class, force_1_item):
-        if sel is None:
-            return []
-        items = []
-        if item_class != dict:
-            self.traversal(sel, rules, item_class, None, items)
-        else:
-            self.traversal_dict(sel, rules, item_class, None, items, force_1_item)
-        return items
-
-    def parse_with_rules(self, response, rules, item_class, force_1_item=False):
-        return self.depth_first_search(Selector(response), rules, item_class, force_1_item)
-
     def parse(self, response):
         parsed_item = dict()
         parsed_settings = dict(self.settings)
@@ -121,6 +44,6 @@ def parse(self, response):
         parsed_item['job_name'] = parsed_settings['job_name']
         parsed_item['unique_id'] = parsed_settings['unique_id']
         parsed_item['task_id'] = os.environ['SCRAPY_JOB']
-        crawled_data = self.parse_with_rules(response, self.list_css_rules, dict)
+        crawled_data = parse_with_rules(response, self.list_css_rules, dict)
         parsed_item['data'] = crawled_data
         yield parsed_item
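
The shared spider_common helpers are driven by a nested CSS-rule mapping like the list_css_rules dicts above. Below is a minimal sketch of how parse_with_rules is expected to behave when fed such a mapping; the HTML fragment, URL and repository values are made up for illustration and simply mirror the class names used in list_css_rules, so they make no claim about real GitHub markup:

    from scrapy.http import HtmlResponse

    from scrapy_app.spider_common import parse_with_rules

    # Hand-written fragment; the markup only mirrors the rule selectors.
    body = b'''
    <div class="repo-list-item">
      <h3 class="repo-list-name"><a href="/octocat/hello-world">hello-world</a></h3>
      <p class="repo-list-meta">Python - 1,234 stars</p>
    </div>
    '''

    response = HtmlResponse(url='https://example.org/trending', body=body, encoding='utf-8')

    rules = {
        '.repo-list-item': {
            'repo_name': '.repo-list-name a::attr(href)',
            'repo_meta': '.repo-list-meta::text',
        }
    }

    # With item_class=dict the traversal yields one dict per rule tree, roughly:
    # [{'.repo-list-item': [{'repo_name': ['/octocat/hello-world'],
    #                        'repo_meta': ['Python - 1,234 stars']}]}]
    print(parse_with_rules(response, rules, dict))

Each spider stores this list under parsed_item['data'] before yielding, so every yielded item carries the crawl metadata (user_id, project_name, job_name, unique_id, task_id) alongside the extracted rows.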