web scraping part #4

Open
wants to merge 1 commit into base: main
251 changes: 251 additions & 0 deletions debug.log

Large diffs are not rendered by default.

193 changes: 193 additions & 0 deletions initial_scrape.py
@@ -0,0 +1,193 @@
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import random
from random import choice
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def initial_scrape():
l1, l2 = [], []
URL = "https://artsandculture.google.com/category/art-movement?tab=pop&date=1000"
req = requests.get(URL)
soup = bs(req.text, "lxml")
td = soup.find_all("script")[3].contents[0][336:]
l = td.split(",") # l1:art movement type name,l2:art movement type link
    # Drop filler tokens. Build a new list instead of calling remove() on the
    # list being iterated over, which silently skips elements.
    l = [
        r
        for r in l
        if r != "null"
        and not r.startswith('"//')
        and r != "[]"
        and not r.endswith('items"')
    ]

    # Keep the quoted values that start with an uppercase letter; these are
    # the art-movement display names.
    for j in range(len(l)):
        if '"' in l[j]:
            index1 = l[j].index('"', 0, -1)
            if l[j][index1 + 1].isupper():
                l1.append(l[j][index1 + 1 : -1])
    # Single-character leftovers from the split (and the token that follows
    # each one) are noise; collect and remove them.
    l1, indl = l1[55:218], []
    for y in range(len(l1) - 1):
        if len(l1[y]) == 1:
            indl.append(l1[y])
            indl.append(l1[y + 1])
    for t in indl:
        if t in l1:
            l1.remove(t)
d1 = {}
for q in l1:
search = q.lower()
search = search.replace(" ", "-")
search = search.replace("'", "-")
for g in l:
if search in g:
l2.append(g)
temp = []
for x in l2:
if x not in temp and x.startswith('"/entity'):
temp.append(x)
l2 = temp
for q in l1:
search = q.lower()
search = search.replace(" ", "-")
search = search.replace("'", "-")
for g in l2:
if search in g:
d1[q] = g
del d1["Romanesque art"]
del d1["Bronze Age"]
del d1["Early Christian art and architecture"]
print(d1)
types = list(d1.keys())
# fw=open("/Users/Lasitha/Documents/art_tweets/scraped_data/tweeted.txt", "w+")
# for t in types:
# fw.write(t+':'+'\n')
# fw.close()
    # Pick a random movement; randint is inclusive at both ends, so the upper
    # bound must be len(types) - 1 to avoid an IndexError.
    length1 = len(types)
    n = random.randint(0, length1 - 1)
    typ = types[n]
url_for_type = d1[typ]
return [typ, url_for_type]


def scrape_out_arts():
a = initial_scrape()
typ = a[0]
url_for_type = "https://artsandculture.google.com" + a[1][1:-1]
driver = webdriver.Chrome("C:/Users/Lasitha/Documents/chromedriver.exe")
driver.delete_all_cookies()
driver.maximize_window()
driver.get(url_for_type)
wait = WebDriverWait(driver, 100)
element = wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "s6J3Hd")))
parentElement = driver.find_element_by_class_name("s6J3Hd")
element = parentElement.find_elements_by_tag_name("div")[3]
print(element)
driver.execute_script("arguments[0].click();", element)
element.click()
time.sleep(3)
url1 = driver.current_url
print(url1)
driver.quit()
req1 = requests.get(url1)
soup1 = bs(req1.text, "lxml")
td1 = soup1.find_all("script")[3].contents[0]
le = td1.split(",")
d2, m = {}, []
for a in le:
if a.startswith('"/asset') and a not in m:
m.append(a)
indx = le.index(a)
d2[le[indx - 3]] = a[1:-1]
art = list(d2.keys())
length2 = len(art)
f1 = open("/Users/Lasitha/Documents/art_tweets/scraped_data/tweeted.txt", "r")
lines = f1.readlines()
for line in lines:
if typ in line:
oldline = line
line1 = line
            ge1, ge2 = line1.index(":"), line1.index("\n")
            if ge2 - ge1 == 1:
                choiceindex = []  # nothing recorded for this movement yet
            else:
                # split() rather than split(" ") so the leading space after the
                # colon does not yield an empty token that breaks int()
                choiceindex = [int(c) for c in line1[ge1 + 1 : ge2].split()]

i = choice([i for i in range(0, length2) if i not in choiceindex])
url_for_art = d2[art[i]]
line1 = line1[:-1] + " " + str(i) + "\n"
new_lines = ""
for line in lines:
if oldline in line:
new_lines += line1
else:
new_lines += line
f1.close()
fe = open("/Users/Lasitha/Documents/art_tweets/scraped_data/tweeted.txt", "w")
fe.write(new_lines)
fe.close()
return [i, url_for_art, typ]


def art_details():
b = scrape_out_arts()
print(b)
url_for_art = "https://artsandculture.google.com" + b[1] + "?hl=en"
print(url_for_art)
req4 = requests.get(url_for_art)
soup4 = bs(req4.text, "lxml")
im = soup4.find_all("img")[0]["src"] # img_link
print(im)
td1 = soup4.find("section", class_="rw8Th QwmCXd")
td3 = td1.find_all("li")
d3 = {}
for i in range(len(td3)):
if "Title" in td3[i].find_all("span")[0].text:
ind = td3[i].text.index(": ")
k = td3[i].text[ind + 1 :][1:]
d3["Title"] = k
if "Creator" in td3[i].find_all("span")[0].text:
ind = td3[i].text.index(": ")
k = td3[i].text[ind + 1 :][1:]
d3["Creator"] = k
if "Date" in td3[i].find_all("span")[0].text:
ind = td3[i].text.index(": ")
k = td3[i].text[ind + 1 :][1:]
d3["Date"] = k
if "Location" in td3[i].find_all("span")[0].text:
ind = td3[i].text.index(": ")
k = td3[i].text[ind + 1 :][1:]
d3["Location"] = k
if "Medium" in td3[i].find_all("span")[0].text:
ind = td3[i].text.index(": ")
k = td3[i].text[ind + 1 :][1:]
d3["Medium"] = k
if "External Link" in td3[i].find_all("span")[0].text:
a = td3[i].find_all("a")
d3["External Link"] = a[0]["href"]
d3["Type"] = b[2]

    f7 = open(
        "/Users/Lasitha/Documents/art_tweets/scraped_data/current_tweet.txt", "w+"
    )

Contributor:
@LasithaE first of all, congrats on opening the PR! Two comments: next time onwards, please open PRs from a different branch. Also, if you check line 179 (and other similar lines throughout), how do you plan to make sure the files exist on the user's PC?
for i in d3:
f7.write(i + ":" + d3[i] + "\n")
f7.close()

response = requests.get("http:" + im)
if response.status_code == 200:
with open(
"/Users/Lasitha/Documents/art_tweets/scraped_data/imgs/current.jpg", "wb"
) as f:
f.write(response.content)


art_details()
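
The reviewer's question above about the hardcoded /Users/Lasitha/... paths could be answered by resolving the output folder relative to the script and creating it on first run. A minimal sketch, assuming a scraped_data/imgs/ folder next to the script; BASE_DIR, DATA_DIR and IMG_DIR are names introduced here, not part of this PR:

import os

# Assumed layout: a scraped_data/imgs/ folder next to this script (not part of the PR).
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "scraped_data")
IMG_DIR = os.path.join(DATA_DIR, "imgs")

# Create the folders if they are missing, so open(..., "w") cannot fail with a
# missing-directory error on another user's machine.
os.makedirs(IMG_DIR, exist_ok=True)

tweeted_path = os.path.join(DATA_DIR, "tweeted.txt")
current_tweet_path = os.path.join(DATA_DIR, "current_tweet.txt")

The absolute paths used in initial_scrape.py and item_details.py could then be swapped for these variables.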
47 changes: 47 additions & 0 deletions item_details.py
@@ -0,0 +1,47 @@
import requests
from bs4 import BeautifulSoup as bs

url2 = "https://artsandculture.google.com/asset/anne-in-a-striped-dress-fairfield-porter/TAHUTWNOPxdkYA"
req2 = requests.get(url2)
soup2 = bs(req2.text, "lxml")
im = "https:" + soup2.find_all("img", class_="pmK5Xc")[0]["src"] # img_link
td1 = soup2.find("section", class_="rw8Th QwmCXd")
td3 = td1.find_all("li")
# print(td3)
d3 = {}
for i in range(len(td3)):
if "Title" in td3[i].find_all("span")[0].text:
ind = td3[i].text.index(": ")
k = td3[i].text[ind + 1 :][1:]
d3["Title"] = k
if "Creator" in td3[i].find_all("span")[0].text:
ind = td3[i].text.index(": ")
k = td3[i].text[ind + 1 :][1:]
d3["Creator"] = k
if "Location" in td3[i].find_all("span")[0].text:
ind = td3[i].text.index(": ")
k = td3[i].text[ind + 1 :][1:]
d3["Location"] = k
if "Medium" in td3[i].find_all("span")[0].text:
ind = td3[i].text.index(": ")
k = td3[i].text[ind + 1 :][1:]
d3["Medium"] = k
if "External Link" in td3[i].find_all("span")[0].text:
a = td3[i].find_all("a")
d3["External Link"] = a[0]["href"]

print(d3)
# d3['Date created']=td3[2].contents[1]
# d3['Location']=td3[3].contents[1]
# d3['Medium']=td3[7].find_all('a')[0].text
f7 = open("/Users/Lasitha/Documents/art_tweets/scraped_data/current_tweet.txt", "w+")
for i in d3:
f7.write(i + ":" + d3[i] + "\n")
f7.close()
response = requests.get(im)
if response.status_code == 200:
with open(
"/Users/Lasitha/Documents/art_tweets/scraped_data/imgs/current.jpg", "wb"
) as f:
f.write(response.content)
print(d3)
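
The per-field if blocks here and in art_details() repeat the same "Label: value" slicing. A minimal sketch of the same extraction as one loop; extract_details is a hypothetical helper, assuming the <li> elements keep the "Label: value" text layout the code above relies on:

def extract_details(items, fields=("Title", "Creator", "Date", "Location", "Medium")):
    """Collect 'Label: value' pairs from the detail <li> elements."""
    details = {}
    for li in items:
        spans = li.find_all("span")
        if not spans:
            continue
        label = spans[0].text
        links = li.find_all("a")
        if "External Link" in label and links:
            details["External Link"] = links[0]["href"]
            continue
        for field in fields:
            if field in label and ": " in li.text:
                # Everything after the first ': ' is the value.
                details[field] = li.text.split(": ", 1)[1]
    return details

With the soup above, extract_details(td3) should reproduce the same d3 dictionary.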
29 changes: 29 additions & 0 deletions scrape_out_arts.py
@@ -0,0 +1,29 @@
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time
from selenium.webdriver.support.ui import WebDriverWait

url_for_type = "https://artsandculture.google.com/entity/abstract-expressionism/m012yb9?categoryid%5C%5Cu003dart-movement"
driver = webdriver.Chrome("C:/Users/Lasitha/Documents/chromedriver.exe")
driver.get(url_for_type)
parentElement = driver.find_element_by_class_name("s6J3Hd")
element = parentElement.find_elements_by_tag_name("div")[3]
element.click()
time.sleep(3)
url1 = driver.current_url
print(url1)
driver.quit()
# url1 = 'https://artsandculture.google.com/entity/abstract-expressionism/m012yb9?categoryid%5C%5Cu003dart-movement&date=1965'#driver.current_url

req1 = requests.get(url1)
soup1 = bs(req1.text, "lxml")
td1 = soup1.find_all("script")[3].contents[0]
le = td1.split(",")
d2, m = {}, []
for a in le:
if a.startswith('"/asset') and a not in m:
m.append(a)
indx = le.index(a)
d2[le[indx - 3]] = a[1:-1]
print(list(d2.keys()))
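
driver.find_element_by_class_name and the positional chromedriver path belong to the Selenium 3 API and were deprecated and later removed in Selenium 4, so these scripts would break after an upgrade. A rough sketch of the Selenium 4 equivalents, assuming the same chromedriver location used above:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 takes the driver path through a Service object instead of a
# positional argument.
service = Service("C:/Users/Lasitha/Documents/chromedriver.exe")
driver = webdriver.Chrome(service=service)
driver.get(url_for_type)

parent_element = driver.find_element(By.CLASS_NAME, "s6J3Hd")
element = parent_element.find_elements(By.TAG_NAME, "div")[3]
element.click()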
6 changes: 6 additions & 0 deletions scraped_data/current_tweet.txt
@@ -0,0 +1,6 @@
Title:En blomstereng nordpå
Creator:Harald Sohlberg
Date:1905
External Link:http://www.digitaltmuseum.no/things/nm/NMK-B/NG.M.00692
Medium:Lerret, Olje på lerret
Type:Neo-romanticism
Binary file added scraped_data/imgs/current.jpg
116 changes: 116 additions & 0 deletions scraped_data/tweeted.txt
@@ -0,0 +1,116 @@
Abstract expressionism:
Abstract impressionism:
Academic art:
Aestheticism:
American Impressionism:
American modernism:
American Realism:
American Renaissance:
Amsterdam Impressionism:
Antwerp school:
Art Deco:
Art Nouveau:
Arte Povera:
Arts and Crafts movement:
Ashcan School:
Barbizon school:
Baroque:
Bauhaus style:
Biomorphism:
Bolognese School:
Caravaggisti:
Classicism:
Cleveland School:
Color field:
Conceptual art:
Constructivism:
Contemporary art:
Cubism:
Dada:
De Stijl:
Der Blaue Reiter:
Die Brücke:
Dutch and Flemish Renaissance painting:
Dutch Golden Age:
Early renaissance:
Expressionism:
Fauvism:
Florentine painting:
Fluxus:
Folk art:
French Renaissance:
Futurism:
Geometric abstraction:
German Expressionism:
German Renaissance:
Gothic art:
Hague School:
Harlem Renaissance:
Heidelberg School:
High Renaissance:
Hudson River School:
Hyperrealism:
Impressionism:
Italian Renaissance:
Japonisme:
Kinetic art:
Land art:
Les Nabis:
Luminism:
Magical Realism:
Mannerism:
Metaphysical art:
Milanese School:
Minimalism:
Modern art:
Modernism:
Naïve art:
Naturalism:
Nazarene movement:
Neo-expressionism:
Neo-Impressionism:
Neo-romanticism:
Neoclassicism:
New Objectivity:
Nihonga:
Northern Renaissance:
Norwich School:
Nouveau réalisme:
Op art:
Outsider art:
Paduan School:
Pennsylvania Impressionism:
Photorealism:
Pop art:
Post-Impressionism:
Postminimalism:
Postmodernism:
Pre-Raphaelite Brotherhood:
Primitivism:
Public art:
Realism:
Regionalism:
Renaissance:
Rococo:
Romanticism:
School of Ferrara:
School of Fontainebleau:
School of Paris:
Section d'Or:
Shin-hanga:
Sienese School:
Social realism:
Socialist realism:
Spanish Eclecticism:
Spanish Renaissance:
Street art:
Sturm und Drang:
Suprematism:
Surrealism:
Symbolism:
Synthetism:
Tonalism:
Ukiyo-e:
Utrecht Caravaggism:
Venetian painting:
Young British Artists: