Skip to content

Commit

Permalink
"Just works on my pc" version
Browse files Browse the repository at this point in the history
  • Loading branch information
Sardor committed Nov 28, 2014
1 parent 9413804 commit b369d29
Show file tree
Hide file tree
Showing 8 changed files with 37 additions and 25 deletions.
4 changes: 2 additions & 2 deletions scrapy.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = uzbdictspider.settings
default = uzbdict.settings

[deploy]
#url = http://localhost:6800/
project = uzbdictspider
project = uzbdict
File renamed without changes.
File renamed without changes.
1 change: 1 addition & 0 deletions uzbdictspider/pipelines.py → uzbdict/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@

class UzbdictspiderPipeline(object):
    """Item pipeline that echoes every item to stdout and passes it through."""

    def process_item(self, item, spider):
        """Print *item* and return it unchanged.

        Args:
            item: The scraped item handed to this pipeline stage.
            spider: The spider that produced the item (unused here).

        Returns:
            The same ``item`` object that was passed in.
        """
        # Parenthesised form prints the same for a single argument on
        # Python 2 and is also valid syntax on Python 3.
        print(item)
        return item
4 changes: 2 additions & 2 deletions uzbdictspider/settings.py → uzbdict/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Bot name setting for this Scrapy project.
BOT_NAME = 'uzbdictspider'

# NOTE(review): this listing comes from a diff view, so the next two lines
# look like the pre-rename values; read as plain Python, the later
# SPIDER_MODULES assignment below is the one that takes effect.
SPIDER_MODULES = ['uzbdictspider.spiders']
# NEWSPIDER_MODULE = 'uzbdictspider.spiders'
# Module paths under the renamed ``uzbdict`` package.
SPIDER_MODULES = ['uzbdict.spiders']
NEWSPIDER_MODULE = 'uzbdict.spiders'

# User-Agent string configured for the crawler.
USER_AGENT = 'uzbdictspider (+http://www.uzbek-dictionary.com)'
File renamed without changes.
32 changes: 32 additions & 0 deletions uzbdict/spiders/uzbdictspider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
import scrapy
import string
from uzbdict.items import DictionaryItem
import html2text


class UzbDictSpider(scrapy.Spider):
    """Spider that queries www.uzbek-dictionary.com once per ASCII letter.

    Each response is parsed into a mapping of link text to the plain-text
    rendering of a ``meaning_wrapper`` block, which is then printed.
    """

    name = "uzbdictspider"
    allowed_domains = ["www.uzbek-dictionary.com"]

    # Search URL; ``%s`` is replaced by the letter being queried.
    _SEARCH_URL = (
        'http://www.uzbek-dictionary.com/trans.php'
        '?trans_text=%s&trans_lang=uz_en&trans_number=10000'
    )

    def start_requests(self):
        """Return one request per lowercase ASCII letter (``a``-``z``).

        Returns:
            A list of ``scrapy.Request`` objects, one per letter.
        """
        # ascii_lowercase is locale-independent and available on both
        # Python 2 and 3, unlike string.lowercase.
        return [
            scrapy.Request(url=self._SEARCH_URL % letter)
            for letter in string.ascii_lowercase
        ]

    def parse(self, response):
        """Extract a word -> translation mapping from *response* and print it.

        Args:
            response: The downloaded search results page.
        """
        hxs = scrapy.Selector(response)
        frame = '/html/body/div[@class="frame"]'
        letters = hxs.xpath(frame + '/a/text()').extract()
        translations = hxs.xpath(
            frame + '/div[@class="meaning_wrapper"]'
        ).extract()

        # Convert each HTML fragment to plain text and trim whitespace.
        trans_items = [html2text.html2text(t).strip() for t in translations]

        # zip() pairs entries by position and silently stops at the shorter
        # list, so a count mismatch between links and meanings drops entries.
        dictionary = dict(zip(letters, trans_items))

        # NOTE(review): nothing is yielded or returned here, so the result is
        # only visible on stdout -- confirm whether items should be emitted.
        print(dictionary)
21 changes: 0 additions & 21 deletions uzbdictspider/spiders/uzbdictspider.py

This file was deleted.

0 comments on commit b369d29

Please sign in to comment.