-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfirst_crawl.py
33 lines (26 loc) · 1.14 KB
/
first_crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import asyncio
import aiohttp
from bs4 import BeautifulSoup
async def fetch_content(url):
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
return await response.text()
async def main():
url = "https://movie.douban.com/cinema/later/beijing/"
init_page = await fetch_content(url)
init_soup = BeautifulSoup(init_page, 'lxml')
movie_names, urls_to_fetch, movie_dates = [], [], []
all_movies = init_soup.find('div', id="showing-soon")
for each_movie in all_movies.find_all('div', class_="item"):
all_a_tag = each_movie.find_all('a')
all_li_tag = each_movie.find_all('li')
movie_names.append(all_a_tag[1].text)
urls_to_fetch.append(all_a_tag[1]['href'])
movie_dates.append(all_li_tag[0].text)
tasks = [fetch_content(url) for url in urls_to_fetch]
pages = await asyncio.gather(*tasks)
for movie_name, movie_date, page in zip(movie_names, movie_dates, pages):
soup_item = BeautifulSoup(page, 'lxml')
img_tag = soup_item.find('img')
print(movie_name, movie_date, img_tag['src'])
asyncio.run(main())