-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_mars.py
110 lines (89 loc) · 3.56 KB
/
scrape_mars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# HW#13: Web Scraping and Document Databases
# Mission to Mars
# Import dependencies
from bs4 import BeautifulSoup
from splinter import Browser
import pandas as pd
import requests
import time
import re
import pymongo
def init_brower(executable_path='/usr/local/bin/chromedriver', headless=False):
    """Start a splinter-controlled Chrome browser and return it.

    Args:
        executable_path: Location of the chromedriver binary handed to
            splinter. Defaults to the path this module has always used.
        headless: Forwarded to splinter's ``headless`` option. Defaults to
            ``False``, matching the previous hard-coded behaviour.

    Returns:
        The object returned by ``Browser('chrome', ...)``. The caller owns it
        and is responsible for calling ``quit()`` when finished.
    """
    return Browser('chrome', executable_path=executable_path, headless=headless)
def _scrape_news(browser):
    """Return ``(title, teaser)`` text of the first article on the NASA Mars news page."""
    browser.visit('https://mars.nasa.gov/news/')
    # Fixed pause between navigating and reading the HTML
    # (presumably so the page has finished rendering -- TODO confirm).
    time.sleep(2)
    news_soup = BeautifulSoup(browser.html, 'html.parser')
    # find() returns the first match only, i.e. the top-most article block.
    news_title = news_soup.find('div', class_='content_title').get_text()
    news_paragraph = news_soup.find(
        'div', class_='rollover_description_inner').get_text()
    # Second pause kept from the original flow, before the next navigation.
    time.sleep(2)
    return news_title, news_paragraph


def _scrape_featured_image(browser):
    """Click through the JPL space-images page and return the first ``<img>`` src."""
    browser.visit('https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars')
    # Three successive links lead to the page holding the full-size jpg;
    # wait two seconds before each click, as the original sequence did.
    for link_text in ('FULL IMAGE', 'more info', '.jpg'):
        time.sleep(2)
        browser.click_link_by_partial_text(link_text)
    jpl_soup = BeautifulSoup(browser.html, 'html.parser')
    return jpl_soup.find('img').get('src')


def _scrape_weather():
    """Return the text of the first tweet mentioning "Sol" on the Mars weather feed.

    Raises:
        IndexError: If the downloaded page contains no matching tweet element.
    """
    response = requests.get('https://twitter.com/marswxreport?lang=en')
    weather_soup = BeautifulSoup(response.text, 'html.parser')
    tweets = weather_soup.find_all(
        string=re.compile("Sol"),
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")
    if not tweets:
        # Same exception type the bare [0] lookup used to raise, but with a
        # message that says what was actually missing.
        raise IndexError(
            "no tweet containing 'Sol' was found on the Mars weather page")
    return tweets[0].text


def _scrape_facts_table():
    """Return the first table on the Mars facts page as a single-line HTML string."""
    facts_df = pd.read_html('https://space-facts.com/mars/')[0]
    facts_df = facts_df.rename(columns={0: 'Mars Planet Profile', 1: ''})
    facts_df.set_index('Mars Planet Profile', inplace=True)
    # Drop the newlines that to_html() emits so the markup is one line.
    return facts_df.to_html().replace('\n', '')


def _clean_hemisphere_title(link_text):
    """Return *link_text* without a trailing "Enhanced" word or outer whitespace.

    ``str.strip('Enhanced')`` is not used here because it removes any of the
    characters E/n/h/a/c/e/d from *both* ends rather than the suffix itself,
    which leaves a trailing space and can eat the start of a title.
    """
    suffix = 'Enhanced'
    title = link_text.strip()
    if title.endswith(suffix):
        title = title[:-len(suffix)].rstrip()
    return title


def _scrape_hemispheres(browser):
    """Return a list of ``{'title', 'img_url'}`` dicts, one per ``<h3>`` search result."""
    hemisphere_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemisphere_url)
    hemisphere_soup = BeautifulSoup(browser.html, 'html.parser')
    hemisphere_image_urls = []
    for heading in hemisphere_soup.find_all('h3'):
        link_text = heading.get_text()
        # Open the detail page via the link whose text matches the heading.
        browser.click_link_by_partial_text(link_text)
        img_url = browser.find_link_by_partial_href('download')['href']
        hemisphere_image_urls.append({
            'title': _clean_hemisphere_title(link_text),
            'img_url': img_url,
        })
        # Go back to the result list so the next heading's link can be clicked.
        browser.visit(hemisphere_url)
    return hemisphere_image_urls


def scrape():
    """Scrape the Mars data sources and return the results in one dictionary.

    Returns:
        dict with the keys ``"news1"`` (news title), ``"news2"`` (news teaser
        paragraph), ``"image"`` (featured image src), ``"weather"`` (tweet
        text), ``"table"`` (facts table as an HTML string) and
        ``"hemispheres"`` (list of ``{'title', 'img_url'}`` dicts).
    """
    browser = init_brower()
    try:
        # Dictionary that can be inserted into mongo as a single document.
        scraped_mars_data = {}
        news_title, news_paragraph = _scrape_news(browser)
        scraped_mars_data["news1"] = news_title
        scraped_mars_data["news2"] = news_paragraph
        scraped_mars_data["image"] = _scrape_featured_image(browser)
        scraped_mars_data["weather"] = _scrape_weather()
        scraped_mars_data["table"] = _scrape_facts_table()
        scraped_mars_data["hemispheres"] = _scrape_hemispheres(browser)
    finally:
        # Always shut the browser down, on success or on any scraping error,
        # so repeated calls do not pile up browser windows.
        browser.quit()
    return scraped_mars_data