-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathalimama.py
151 lines (121 loc) · 5.36 KB
/
alimama.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# coding=utf-8
import urllib
import urllib2
import cookielib
import re
import hashlib
import json
class Alimama(object):
    """HTTP client that signs in to alimama.com and scrapes its pages.

    Every request goes through one cookie-aware opener, so the cookies set
    while logging in are sent again by the later calls. Pages are parsed with
    regular expressions, not an HTML parser.
    """

    _USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
    _TOP_DATA_PREFIX = 'var top_data='

    # Patterns used by the search-result parser, compiled once for the class.
    _ROW_RE = re.compile('<tr.*?>.*?</tr>', re.S)
    _CELL_RE = re.compile('<td.*?>.*?</td>', re.S)
    _CELL_TEXT_RE = re.compile('<td.*?>(.*?)</td>', re.S)
    _PID_RE = re.compile('<input.*?value="(.*?)"', re.S)
    _INFO_RE = re.compile(
        '<img.*?src="(.*?)".*?<a.*?>(.*?)</a>.*?<p.*?>(.*?)<span.*</p>', re.S)
    _PAGER_RE = re.compile(
        r'\{count:(.*?),size:(.*?),index:(.*?),goTo:false,pageCount:false\}',
        re.S)

    def __init__(self):
        # Build the opener up front so the fetch helpers are usable even
        # when login() has not been called yet.
        self._prepare()

    def _prepare(self):
        """Create a fresh cookie jar and a URL opener that uses it."""
        self._cookie = cookielib.CookieJar()
        self._opener = urllib2.build_opener(
            urllib2.HTTPCookieProcessor(self._cookie))

    def _get_request(self, url):
        """Return a request object for *url* carrying the User-Agent header."""
        request = urllib2.Request(url)
        request.add_header('User-Agent', self._USER_AGENT)
        return request

    def _fetch_html(self, request, post_data=None):
        """Open *request* and return ``(response_url, body)``.

        A non-empty *post_data* mapping is URL-encoded and sent as the
        request body; a missing or empty one opens the request without data.
        """
        if post_data:
            handler = self._opener.open(request, urllib.urlencode(post_data))
        else:
            handler = self._opener.open(request)
        try:
            return handler.url, handler.read()
        finally:
            # Release the response even if read() raises.
            handler.close()

    def login(self, email, password):
        """Sign in with *email* / *password*.

        Loads the home page, follows the ``J_mmLoginIfr`` iframe to the login
        form, copies the form's ``<input>`` name/value pairs, fills in the
        credentials and submits the form to its ``action`` URL.

        Returns:
            True when the URL of the final response contains ``'success'``;
            False otherwise, including when the login iframe or the form
            cannot be found in the fetched pages.
        """
        # Start from an empty cookie jar on every login attempt.
        self._prepare()
        url, html = self._fetch_html(self._get_request('http://www.alimama.com'))
        iframe_srcs = re.findall(
            '<iframe.*?id="J_mmLoginIfr".*?src="(.*?)".*?></iframe>', html)
        if not iframe_srcs:
            return False
        url, html = self._fetch_html(self._get_request(iframe_srcs[0]))
        actions = re.findall('<form.*?action=[\'"](.*?)[\'"]', html)
        if not actions:
            return False
        # Keep the inputs already present in the form; a repeated name keeps
        # its last value.
        form = dict(re.findall(
            '<input.*?name=[\'"](.*?)[\'"].*?value=[\'"](.*?)[\'"].*?>', html))
        form['logname'] = email
        form['originalLogpasswd'] = password
        # md5 needs bytes; encode text so non-ASCII passwords do not fail.
        raw_password = password
        if not isinstance(raw_password, bytes):
            raw_password = raw_password.encode('utf-8')
        form['logpasswd'] = hashlib.md5(raw_password).hexdigest().lower()
        url, html = self._fetch_html(self._get_request(actions[0]), form)
        return 'success' in url

    def _parse_top_data(self, html):
        """Decode one ranking response, dropping a ``var top_data=`` prefix."""
        if html.startswith(self._TOP_DATA_PREFIX):
            html = html[len(self._TOP_DATA_PREFIX):]
        return json.loads(html)

    def get_top_information(self):
        """Fetch the top.taobao.com rankings.

        Returns:
            A dict with the keys ``keywords``, ``bag``, ``woman_clothes`` and
            ``man_clothes``, each mapped to the decoded JSON of its response.
        """
        tops = {
            # Hot keywords
            'keywords': 'http://top.taobao.com/interface_v2.php?trid=TR_zongbang&f=json&n=100&up=false',
            # Boutique bags and luggage
            'bag': 'http://top.taobao.com/interface_v2.php?cat=50006842&n=100&f=json',
            # Fashionable women's clothing
            'woman_clothes': 'http://top.taobao.com/interface_v2.php?cat=16&n=100&f=json',
            # Stylish men's clothing
            # NOTE(review): same cat id as 'bag' -- looks like a copy-paste
            # slip; confirm the men's clothing category id.
            'man_clothes': 'http://top.taobao.com/interface_v2.php?cat=50006842&n=100&f=json',
        }
        top_infos = {}
        for key, top_url in tops.items():
            url, html = self._fetch_html(self._get_request(top_url))
            top_infos[key] = self._parse_top_data(html)
        return top_infos

    def get_taobao_promo_url(self, id):
        """Return the ``clickUrl`` value found on the promotion page of an item.

        Args:
            id: auction id placed in the ``auction_id`` query parameter. The
                name shadows the builtin but is kept for existing callers.

        Raises:
            IndexError: the page contains no ``var clickUrl = '...';`` line.
        """
        url = 'http://u.alimama.com/union/spread/common/allCode.htm?specialType=item&auction_id=%s' % id
        url, html = self._fetch_html(self._get_request(url))
        click_urls = re.findall('var clickUrl = [\'"](.*?)[\'"];', html)
        if not click_urls:
            raise IndexError('clickUrl not found for auction_id=%s' % id)
        return click_urls[0]

    def _clear_others(self, words):
        """Strip *words* and delete every newline, carriage return and tab."""
        return words.strip().replace('\n', '').replace('\r', '').replace('\t', '')

    def _parse_search_html(self, html, pagesize):
        """Extract the product rows and pager data from a search result page.

        Rows that do not have the five-cell product layout are skipped
        instead of raising IndexError.
        """
        products = []
        for row in self._ROW_RE.findall(html):
            cells = self._CELL_RE.findall(row)
            if len(cells) < 5:
                continue
            pid_match = self._PID_RE.search(cells[0])
            info_match = self._INFO_RE.search(cells[1])
            if pid_match is None or info_match is None:
                continue
            image, title, saler = [
                self._clear_others(part) for part in info_match.groups()]
            discount, price, promo_percent = [
                self._clear_others(self._CELL_TEXT_RE.search(cell).group(1))
                for cell in cells[2:5]]
            products.append({
                'pid': self._clear_others(pid_match.group(1)),
                'image': image,
                'title': title.replace('<span>', '').replace('</span>', ''),
                'saler': saler,
                'discount': discount,
                'price': price,
                'promo_percent': promo_percent
            })

        pager = self._PAGER_RE.search(html)
        if pager:
            record_count = int(pager.group(1))
            page_size = int(pager.group(2))
            current_page = int(pager.group(3))
            # Floor division keeps page_sum an int on every Python version.
            page_sum, remainder = divmod(record_count, page_size)
            if remainder > 0:
                page_sum += 1
        else:
            # No pager on the page: report what was parsed as a single page.
            record_count = len(products)
            page_size = pagesize
            current_page = 1
            page_sum = 1
        return {
            'current_page': current_page,
            'page_sum': page_sum,
            'page_size': page_size,
            'record_count': record_count,
            'data': products
        }

    def search(self, keywords, page=1, pagesize=40):
        """Search the merchandise promotion page for *keywords*.

        Args:
            keywords: search text; it is URL-quoted before being sent.
            page: value of the ``page`` query parameter.
            pagesize: value of the ``pagesize`` query parameter; also
                reported as ``page_size`` when the page has no pager data.

        Returns:
            A dict with ``current_page``, ``page_sum``, ``page_size``,
            ``record_count`` and ``data``. ``data`` is a list of dicts with
            the keys ``pid``, ``image``, ``title``, ``saler``, ``discount``,
            ``price`` and ``promo_percent``.
        """
        url = 'http://u.alimama.com/union/spread/selfservice/merchandisePromotion.htm?cat=&mid=&searchType=0&page=%s&pagesize=%s&q=%s&_fmu.a._0.so=_commrate' % (page, pagesize, urllib.quote(keywords))
        url, html = self._fetch_html(self._get_request(url))
        return self._parse_search_html(html, pagesize)
if __name__ == '__main__':
    # Demo run. The guard keeps an import of this module from logging in
    # over the network as a side effect.
    ali = Alimama()
    # NOTE(review): the credentials below look like placeholders -- replace
    # them with a real account before running.
    if ali.login('[email protected]', 'password'):
        # Other calls that can be tried here:
        #   ali.get_taobao_promo_url(10449760543)
        #   ali.get_top_information()
        print(ali.search('女装'))