Merge pull request #46 from yashagarwal1999/master
PR Merged. Good Job
Showing 17 changed files with 510 additions and 0 deletions.
@@ -0,0 +1,40 @@
from bs4 import BeautifulSoup as bs
import requests

import createdb as db

url = "https://www.moneycontrol.com/news/politics/"
page = "moneycontrol"

def scrape_money():
    # Each story on the politics page is an <li class="clearfix">
    # holding an <a> (title + link) and a <p> (summary).
    r = requests.get(url)
    sp = bs(r.content, 'html5lib')
    stories = sp.findAll('li', {'class': 'clearfix'})

    con = db.create()
    if con is None:
        return False
    curr = con.cursor()
    for i in stories:
        headline = i.a['title']
        link = i.a['href']
        summary = i.p.text
        # OR IGNORE: the (source, link, title) primary key would otherwise
        # raise IntegrityError when a story is scraped twice.
        curr.execute("INSERT OR IGNORE INTO Scrapped_data VALUES(?,?,?,?)",
                     (page, link, headline, summary))
    con.commit()
    curr.close()
    return True

if __name__ == '__main__':
    scrape_money()
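A quick sanity check after a run, as a sketch; it assumes you are in the directory holding database_content.db and that at least one scraping pass has committed rows:

import sqlite3

con = sqlite3.connect('database_content.db')
# Columns, per createdb.py: (source, link, title, Description).
for row in con.execute(
        "SELECT title, link FROM Scrapped_data WHERE source=? ORDER BY rowid DESC LIMIT 3",
        ("moneycontrol",)):
    print(row)
con.close()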
@@ -0,0 +1,42 @@
from bs4 import BeautifulSoup as bs
import requests

import createdb as db

url = "https://in.reuters.com/news/archive/technologyNews"
page = "reuters"
base = "https://in.reuters.com"

def scrape_reuters():
    r = requests.get(url)
    sp = bs(r.content, 'html5lib')
    stories = sp.findAll('div', {'class': 'story-content'})
    heads = sp.findAll('h3', {'class': 'story-title'})
    # Assumes the j-th <p> on the page lines up with the j-th story,
    # mirroring the archive page's layout.
    body = sp.findAll('p')

    con = db.create()
    if con is None:
        return False
    curr = con.cursor()
    for j, i in enumerate(stories):
        headline = heads[j].text.strip()
        article = body[j].text
        link = base + i.a['href']  # archive hrefs are site-relative
        curr.execute("INSERT OR IGNORE INTO Scrapped_data VALUES(?,?,?,?)",
                     (page, link, headline, article))
    con.commit()
    curr.close()
    return True

if __name__ == '__main__':
    scrape_reuters()
@@ -0,0 +1,31 @@
from bs4 import BeautifulSoup as bs
import requests

import createdb as db

myurl = "https://www.newegg.com/global/in-en/Gaming-Laptops/SubCategory/ID-3365?Tid=1297731"
source = "newegg"

def scrape_newegg():
    r = requests.get(myurl)
    s = bs(r.content, 'html5lib')
    # Every product listing sits in a <div class="item-container">.
    containers = s.findAll('div', {'class': 'item-container'})

    con = db.create()
    if con is None:
        return False
    curr = con.cursor()
    for contain in containers:
        price = contain.find("li", "price-current").text.strip()
        brand = contain.find("div", "item-branding").img["title"].strip() + "\n" + price
        product = contain.find("a", "item-title").text.strip()
        link = contain.find('a', "item-title")['href']
        curr.execute("INSERT OR IGNORE INTO Scrapped_data VALUES(?,?,?,?)",
                     (source, link, brand, product))
    con.commit()
    curr.close()
    return True
47 additions & 0 deletions: P13_ContentAggregator/P13_YashAgarwal/Scrape/scrape_quora.py
@@ -0,0 +1,47 @@
from bs4 import BeautifulSoup as bs
import requests

import createdb as db

def scrape_quora():
    url = "https://www.quora.com/topic/Web-Development"
    page = "https://www.quora.com"
    source = "Quora"
    r = requests.get(url)
    sp = bs(r.content, 'html5lib')
    questions = sp.findAll('a', attrs={'class': 'question_link'})

    con = db.create()
    if con is None:
        return False
    curr = con.cursor()
    for qlink in questions:
        qhref = qlink['href']
        final_link = page + qhref
        # Fetch each question page and keep the first answer paragraph
        # as the stored description.
        r1 = requests.get(final_link)
        sp1 = bs(r1.content, 'html5lib')
        paras = sp1.findAll('p', attrs={'class': 'ui_qtext_para u-ltr u-text-align--start'})
        if not paras:  # skip questions with no visible answer text
            continue
        curr.execute("INSERT OR IGNORE INTO Scrapped_data VALUES(?,?,?,?)",
                     (source, final_link, qhref, paras[0].text))
    con.commit()
    curr.close()
    return True

if __name__ == '__main__':
    scrape_quora()
34 additions & 0 deletions: P13_ContentAggregator/P13_YashAgarwal/Scrape/scrapenews.py
@@ -0,0 +1,34 @@
from bs4 import BeautifulSoup as bs
import requests

import createdb as db

url = "https://inshorts.com/en/read"
page = "inshorts"
p1 = "https://inshorts.com"

def scrape_news():
    r = requests.get(url)
    sp = bs(r.content, 'html5lib')
    headlines = sp.findAll('div', {'class': 'news-card-title'})
    articles = sp.findAll('div', {'itemprop': 'articleBody'})

    con = db.create()
    if con is None:
        return False
    curr = con.cursor()
    for j, i in enumerate(headlines):
        head = i.span.text
        link = p1 + i.a['href']  # card hrefs are site-relative
        curr.execute("INSERT OR IGNORE INTO Scrapped_data VALUES(?,?,?,?)",
                     (page, link, head, articles[j].text))
    con.commit()
    curr.close()
    return True
@@ -0,0 +1,43 @@
from bs4 import BeautifulSoup as bs
import requests

import createdb as db

url = "https://www.space.com/news"
page = "space"

def scrape_space():
    r = requests.get(url)
    sp = bs(r.content, 'html5lib')
    # Headline, synopsis and link live in parallel lists on the news page.
    heads = sp.findAll('h3', {'class': 'article-name'})
    bodies = sp.findAll('p', {'class': 'synopsis'})
    links = sp.findAll('a', {'class': 'article-link'})

    con = db.create()
    if con is None:
        return False
    curr = con.cursor()
    for j in range(len(heads)):
        curr.execute("INSERT OR IGNORE INTO Scrapped_data VALUES(?,?,?,?)",
                     (page, links[j]['href'], heads[j].text, bodies[j].text))
    con.commit()
    curr.close()
    return True

if __name__ == '__main__':
    scrape_space()
@@ -0,0 +1,43 @@
from bs4 import BeautifulSoup as bs
import requests

import createdb as db

url = "https://www.indiatoday.in/sport"
page = "indiatoday"
u = "https://www.indiatoday.in"

def scrape_sports():
    r = requests.get(url)
    sp = bs(r.content, 'html5lib')
    # Each story block: <div class="detail"> with an <h2 title=...>,
    # a summary <p>, and a site-relative <a href>.
    stories = sp.findAll('div', {'class': 'detail'})

    con = db.create()
    if con is None:
        return False
    curr = con.cursor()
    for i in stories:
        headline = i.h2['title']
        article = i.p.text
        link = u + i.a['href']
        curr.execute("INSERT OR IGNORE INTO Scrapped_data VALUES(?,?,?,?)",
                     (page, link, headline, article))
    con.commit()
    curr.close()
    return True

if __name__ == '__main__':
    scrape_sports()
@@ -0,0 +1,58 @@
import time

import createdb as db
from Scrape import scrape, scrapenews, reuters, politics, sports, space

source = ['inshorts', 'moneycontrol', 'reuters', 'indiatoday', 'space']
name = ['Inshorts', 'MoneyControl', 'Reuters India', 'IndiaToday', 'Space']

def start_scrapper():
    # Run every scraper once; each returns False when it could not
    # fetch or store its source.
    if not scrapenews.scrape_news():
        print("Error in fetching Inshorts")
    if not politics.scrape_money():
        print("Error in fetching MoneyControl")
    if not reuters.scrape_reuters():
        print("Error in fetching Reuters")
    if not sports.scrape_sports():
        print("Error in fetching IndiaToday")
    if not space.scrape_space():
        print("Error in fetching space.com")
    print("Done scraping; check the db")

def getContent():
    # Map each display name to its rows, newest first; None if any
    # source has no rows yet.
    content = {}
    conn = db.create()
    c = conn.cursor()
    for j, i in enumerate(source):
        z = c.execute("SELECT * FROM Scrapped_data WHERE source=? ORDER BY rowid DESC", (i,))
        content[name[j]] = z.fetchall()
        if content[name[j]] == []:
            conn.close()
            return None
    conn.close()
    return content

def getContentForSource(s):
    if s not in source:
        return None
    i = source.index(s)
    content = {}
    conn = db.create()
    c = conn.cursor()
    z = c.execute("SELECT * FROM Scrapped_data WHERE source=? ORDER BY rowid DESC", (s,))
    content[name[i]] = z.fetchall()
    if content[name[i]] == []:
        conn.close()
        return None
    conn.close()
    return content

def scrapeStart():
    # Re-scrape all sources once an hour.
    while True:
        start_scrapper()
        time.sleep(3600)

if __name__ == '__main__':
    start_scrapper()
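A minimal usage sketch for this module; `aggregator` below is a hypothetical stand-in, since the diff does not show this file's actual name:

# Hypothetical import name; the module's real filename is not shown in the diff.
import aggregator

aggregator.start_scrapper()        # one scraping pass over all five sources
content = aggregator.getContent()  # {display name: rows, newest first} or None
if content:
    for site, rows in content.items():
        # each row is (source, link, title, Description) per createdb's schema
        print(site, "->", rows[0][2])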
Binary file added (+167 KB): ...tAggregator/P13_YashAgarwal/Screenshots/Screenshot from 2019-07-07 20-19-48.png
Binary file added (+160 KB): ...tAggregator/P13_YashAgarwal/Screenshots/Screenshot from 2019-07-07 20-19-52.png
Binary file added (+144 KB): ...tAggregator/P13_YashAgarwal/Screenshots/Screenshot from 2019-07-07 20-19-58.png
@@ -0,0 +1,28 @@
import sqlite3
from sqlite3 import Error

def create():
    # Open (or create) the database and make sure the table exists.
    try:
        con = sqlite3.connect('database_content.db')
        # The composite (source, link, title) primary key deduplicates
        # stories that get scraped more than once.
        table_structure = ("CREATE TABLE IF NOT EXISTS Scrapped_data("
                           "source text NOT NULL,"
                           "link text NOT NULL,"
                           "title text NOT NULL,"
                           "Description text NOT NULL,"
                           "CONSTRAINT PK PRIMARY KEY(source,link,title))")
        generate_table(con, table_structure)
        return con
    except Error as e:
        # str(e): concatenating the Error object itself would raise TypeError
        print("Error in create function in createdb file\n" + str(e))
        return None

def generate_table(con, table_structure):
    try:
        curr = con.cursor()
        curr.execute(table_structure)
        con.commit()
    except Error as e:
        print(e)
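A small self-contained sketch of the deduplication this schema provides: with (source, link, title) as the primary key, INSERT OR IGNORE makes repeat scrapes idempotent (an in-memory database here, so nothing touches database_content.db):

import sqlite3

con = sqlite3.connect(':memory:')  # throwaway database, demo only
con.execute("CREATE TABLE Scrapped_data(source text NOT NULL, link text NOT NULL, "
            "title text NOT NULL, Description text NOT NULL, "
            "CONSTRAINT PK PRIMARY KEY(source,link,title))")
row = ("inshorts", "https://inshorts.com/x", "Some headline", "Body text")
for _ in range(2):  # simulate scraping the same story twice
    con.execute("INSERT OR IGNORE INTO Scrapped_data VALUES(?,?,?,?)", row)
con.commit()
print(con.execute("SELECT COUNT(*) FROM Scrapped_data").fetchone()[0])  # prints 1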
Binary file not shown.