
Commit

Merge pull request #46 from yashagarwal1999/master
PR Merged. Good Job
gdsoumya authored Jul 7, 2019
2 parents f58f9b5 + bdafc42 commit 7b9f411
Showing 17 changed files with 510 additions and 0 deletions.
40 changes: 40 additions & 0 deletions P13_ContentAggregator/P13_YashAgarwal/Scrape/politics.py
@@ -0,0 +1,40 @@
from bs4 import BeautifulSoup as bs
import requests

import createdb as db

url = "https://www.moneycontrol.com/news/politics/"
page = "moneycontrol"


def scrape_money():
    r = requests.get(url)

    sp = bs(r.content, 'html5lib')
    head = sp.findAll('li', {'class': 'clearfix'})
    #article = sp.findAll('div', {'itemprop': 'articleBody'})
    j = 0

    con = db.create()

    if con is not None:
        for i in head:
            curr = con.cursor()
            headline = i.a['title']
            #print(headline)
            link = i.a['href']
            #print(link)
            #link = page + link
            a = i.p.text
            #print(a)

            j += 1
            curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)", (page, link, headline, a))
            #print(headline + "\n" + link + "\n" + a + "\n")
            #f.write(head.replace(",", "") + "," + link + "," + a.replace(",", "") + "\n")

        con.commit()
        curr.close()
        return True
    else:
        return False
    #f.close()


if __name__ == '__main__':
    scrape_money()
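
A quick way to verify what scrape_money() stored is to query the SQLite file directly; a minimal sketch (not part of this commit), assuming it is run from the same directory as database_content.db, the path hard-coded in createdb.py:

import sqlite3

# Open the same database file that createdb.create() uses.
con = sqlite3.connect('database_content.db')
cur = con.cursor()

# Print the five most recent moneycontrol rows.
for row in cur.execute(
        "SELECT source, link, title FROM Scrapped_data "
        "WHERE source=? ORDER BY rowid DESC LIMIT 5;", ("moneycontrol",)):
    print(row)

con.close()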
42 changes: 42 additions & 0 deletions P13_ContentAggregator/P13_YashAgarwal/Scrape/reuters.py
@@ -0,0 +1,42 @@
from bs4 import BeautifulSoup as bs
import requests

import createdb as db

url = "https://in.reuters.com/news/archive/technologyNews"
page = "reuters"
u = "https://in.reuters.com"


def scrape_reuters():
    r = requests.get(url)

    sp = bs(r.content, 'html5lib')
    div = sp.findAll('div', {'class': 'story-content'})
    heads = sp.findAll('h3', {'class': 'story-title'})
    body = sp.findAll('p')
    j = 0
    #print(div[0].p.text)
    con = db.create()
    if con is not None:
        for i in div:
            curr = con.cursor()
            headline = heads[j].text.strip()

            article = body[j].text
            #print(headline + "\n")
            #print(article)
            j += 1
            link = i.a['href']
            link = u + link
            curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)", (page, link, headline, article))
            #print(headline + "\n" + link + "\n" + article + "\n")
            #f.write(head.replace(",", "") + "," + link + "," + a.replace(",", "") + "\n")

        con.commit()
        curr.close()
        return True
    else:
        return False
    #f.close()


if __name__ == '__main__':
    scrape_reuters()
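
scrape_reuters() relies on heads[j] and body[j] staying aligned with the story-content divs, and body comes from a page-wide findAll('p'); a hedged alternative (an assumption about the page layout, not the committed code) reads the headline and summary from inside each div, so a stray paragraph elsewhere on the page cannot shift the pairing:

# Hypothetical helper sketch: scope all lookups to one story card at a time.
def scrape_reuters_scoped(sp, page, u, con):
    curr = con.cursor()
    for story in sp.findAll('div', {'class': 'story-content'}):
        title_tag = story.find('h3', {'class': 'story-title'})
        para_tag = story.find('p')
        if title_tag is None or para_tag is None or story.a is None:
            continue  # skip malformed cards instead of raising an error
        headline = title_tag.text.strip()
        article = para_tag.text.strip()
        link = u + story.a['href']
        curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)",
                     (page, link, headline, article))
    con.commit()
    curr.close()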
31 changes: 31 additions & 0 deletions P13_ContentAggregator/P13_YashAgarwal/Scrape/scrape.py
@@ -0,0 +1,31 @@
from bs4 import BeautifulSoup as bs
import requests
#from urllib.request import urlopen as uop
import createdb as db

myurl = "https://www.newegg.com/global/in-en/Gaming-Laptops/SubCategory/ID-3365?Tid=1297731"
source = "newegg"


def scrape_newegg():
    r = requests.get(myurl)
    s = bs(r.content, 'html5lib')
    container = s.findAll('div', {'class': 'item-container'})

    con = db.create()
    if con is None:  # guard against a failed database connection, as the other scrapers do
        return False

    for contain in container:
        curr = con.cursor()
        price = contain.find("li", "price-current").text.strip()
        brand = contain.find("div", "item-branding").img["title"].strip() + "\n" + price
        product = contain.find("a", "item-title").text.strip()
        url = contain.find('a', "item-title")
        linkk = url['href']

        curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)", (source, linkk, brand, product))
        #f.write(brand + "," + product.replace(",", "|") + "," + price.replace(",", ""))

    con.commit()
    curr.close()
    #f.close()
    return True
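
Unlike its siblings, scrape.py is not wired into Scrapper.py's start_scrapper() and has no command-line entry point; a minimal guard modeled on the other modules (an assumption, not part of this commit) would let it run standalone:

if __name__ == '__main__':
    if not scrape_newegg():
        print("Error in fetching newegg")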

47 changes: 47 additions & 0 deletions P13_ContentAggregator/P13_YashAgarwal/Scrape/scrape_quora.py
@@ -0,0 +1,47 @@
from bs4 import BeautifulSoup as bs
import requests
#from urllib.request import urlopen as uop
import createdb as db


def scrape_quora():
    url = "https://www.quora.com/topic/Web-Development"
    page = "https://www.quora.com"
    source = "Quora"
    r = requests.get(url)
    #r = requests.get("https://www.quora.com/topic/Hollywood")
    sp = bs(r.content, 'html5lib')
    question_links = sp.findAll('a', attrs={'class': 'question_link'})
    con = db.create()
    if con is not None:
        for Qlink in question_links:
            Qhref = Qlink['href']

            FinalLink = page + Qhref
            r1 = requests.get(FinalLink)
            sp1 = bs(r1.content, 'html5lib')
            span1 = sp1.findAll('p', attrs={'class': 'ui_qtext_para u-ltr u-text-align--start'})

            if not span1:  # skip questions with no answer paragraphs to avoid an IndexError
                continue
            text1 = span1[0].text
            #print(Qhref + "\n")
            #print(text1)

            curr = con.cursor()
            curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)", (source, FinalLink, Qhref, text1))

        con.commit()
        curr.close()
        return True

    else:
        return False


if __name__ == '__main__':
    t = scrape_quora()
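
scrape_quora() issues a separate GET for every question link, so one run can make dozens of requests; a minimal sketch of reusing one connection and pausing between requests (the Session object, the helper name, and the delay are assumptions, not part of the committed code):

import time
import requests
from bs4 import BeautifulSoup as bs

session = requests.Session()  # reuse one connection for all question pages


def fetch_answer_text(final_link, delay=1.0):
    r1 = session.get(final_link)
    sp1 = bs(r1.content, 'html5lib')
    paras = sp1.findAll('p', attrs={'class': 'ui_qtext_para u-ltr u-text-align--start'})
    time.sleep(delay)  # be polite between successive question pages
    return paras[0].text if paras else None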


34 changes: 34 additions & 0 deletions P13_ContentAggregator/P13_YashAgarwal/Scrape/scrapenews.py
@@ -0,0 +1,34 @@
from bs4 import BeautifulSoup as bs
import requests
from urllib.request import urlopen as uop
import createdb as db

url = "https://inshorts.com/en/read"
page = "inshorts"
p1 = "https://inshorts.com"


def scrape_news():
    r = requests.get(url)

    sp = bs(r.content, 'html5lib')
    headline = sp.findAll('div', {'class': 'news-card-title'})
    article = sp.findAll('div', {'itemprop': 'articleBody'})
    j = 0
    con = db.create()
    if con is not None:
        for i in headline:
            curr = con.cursor()
            head = i.span.text
            link = i.a['href']
            link = p1 + link
            a = article[j].text
            j += 1
            curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)", (page, link, head, a))
            #print(head + "\n" + link + "\n" + a + "\n")
            #f.write(head.replace(",", "") + "," + link + "," + a.replace(",", "") + "\n")

        con.commit()
        curr.close()
        return True
    else:
        return False
    #f.close()

43 changes: 43 additions & 0 deletions P13_ContentAggregator/P13_YashAgarwal/Scrape/space.py
@@ -0,0 +1,43 @@
from bs4 import BeautifulSoup as bs
import requests

import createdb as db

url = "https://www.space.com/news"
page = "space"
u = "https://www.space.com"


def scrape_space():
    r = requests.get(url)

    sp = bs(r.content, 'html5lib')
    div = sp.findAll('div', {'class': 'content'})
    heads = sp.findAll('h3', {'class': 'article-name'})
    body = sp.findAll('p', {'class': 'synopsis'})
    links = sp.findAll('a', {'class': 'article-link'})
    j = 0
    #print(div[0].p.text)
    con = db.create()
    if con is not None:
        for i in div:
            curr = con.cursor()
            headline = heads[j].text
            link = links[j]['href']
            article = body[j].text
            """print(headline + "\n")
            print(article)
            print(link)"""
            j += 1

            curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)", (page, link, headline, article))
            #print(headline + "\n" + link + "\n" + article + "\n")
            #f.write(head.replace(",", "") + "," + link + "," + a.replace(",", "") + "\n")

        con.commit()
        curr.close()
        return True
    else:
        return False
    #f.close()


if __name__ == '__main__':
    scrape_space()
43 changes: 43 additions & 0 deletions P13_ContentAggregator/P13_YashAgarwal/Scrape/sports.py
@@ -0,0 +1,43 @@
from bs4 import BeautifulSoup as bs
import requests

import createdb as db

url = "https://www.indiatoday.in/sport"
page = "indiatoday"
u = "https://www.indiatoday.in"


def scrape_sports():
    r = requests.get(url)

    sp = bs(r.content, 'html5lib')
    div = sp.findAll('div', {'class': 'detail'})
    #heads = sp.findAll('h3', {'class': 'story-title'})
    body = sp.findAll('p')
    j = 0
    #print(div[0].p.text)
    con = db.create()
    if con is not None:
        for i in div:
            curr = con.cursor()
            headline = i.h2['title']
            article = i.p.text
            #article = body[j].text
            #print(headline + "\n")
            #print(article)
            j += 1
            link = i.a['href']
            link = u + link
            #print(link)
            curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)", (page, link, headline, article))
            #print(headline + "\n" + link + "\n" + article + "\n")
            #f.write(head.replace(",", "") + "," + link + "," + a.replace(",", "") + "\n")

        con.commit()
        curr.close()
        return True
    else:
        return False
    #f.close()


if __name__ == '__main__':
    scrape_sports()
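
Every Scrape module above parses with bs(r.content, 'html5lib'), which needs the third-party html5lib package installed alongside requests and beautifulsoup4 (roughly pip install requests beautifulsoup4 html5lib; the exact requirements file is not shown in this commit). A hedged fallback sketch for machines where that parser is missing:

from bs4 import BeautifulSoup as bs
from bs4 import FeatureNotFound


def make_soup(content):
    # Hypothetical helper: prefer html5lib, fall back to the parser that
    # ships with the standard library if the package is not installed.
    try:
        return bs(content, 'html5lib')
    except FeatureNotFound:
        return bs(content, 'html.parser')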
58 changes: 58 additions & 0 deletions P13_ContentAggregator/P13_YashAgarwal/Scrapper.py
@@ -0,0 +1,58 @@
import createdb as db
from Scrape import scrape, scrapenews, reuters, politics, sports, space  # scrape (newegg) is imported but not scheduled below
import time

source = ['inshorts', 'moneycontrol', 'reuters', 'indiatoday', 'space']
name = ['Inshorts', 'MoneyControl', 'Reuters India', 'IndiaToday', 'Space']


def start_scrapper():
    if not scrapenews.scrape_news():
        print("Error in fetching Inshorts")
    if not politics.scrape_money():
        print("Error in fetching MoneyControl")
    if not reuters.scrape_reuters():
        print("Error in fetching Reuters")
    if not sports.scrape_sports():
        print("Error in fetching IndiaToday")
    if not space.scrape_space():
        print("Error in fetching space.com")

    print("Done scraping, check db")


def getContent():
    content = {}
    conn = db.create()
    c = conn.cursor()

    for j, i in enumerate(source):
        # Parameterised query, newest rows first.
        z = c.execute("SELECT * FROM Scrapped_data WHERE source=? ORDER BY rowid DESC;", (i,))
        content[name[j]] = z.fetchall()
        if content[name[j]] == []:
            # If any source has no rows yet, treat the whole aggregate as unavailable.
            conn.close()
            return None
    conn.close()
    return content


def getContentForSource(s):
    if s in source:
        i = source.index(s)
        content = {}
        conn = db.create()
        c = conn.cursor()
        z = c.execute("SELECT * FROM Scrapped_data WHERE source=? ORDER BY rowid DESC;", (s,))
        content[name[i]] = z.fetchall()
        if content[name[i]] == []:  # was name[j], which is undefined in this function
            conn.close()
            return None
        conn.close()
        return content
    else:
        return None


def scrapeStart():
    while True:
        start_scrapper()
        time.sleep(3600)  # re-scrape every hour


if __name__ == '__main__':
    start_scrapper()
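
Scrapper.py defines scrapeStart() for the hourly loop, but running the file only performs a single pass; a minimal sketch of how a consumer sitting next to Scrapper.py might keep the loop running in the background and read the aggregated content (the threading setup is an assumption, not part of this commit):

import threading
import Scrapper

# Run the hourly scrape loop in a daemon thread so it stops with the program.
scraper_thread = threading.Thread(target=Scrapper.scrapeStart, daemon=True)
scraper_thread.start()

content = Scrapper.getContent()  # {'Inshorts': [...], 'MoneyControl': [...], ...} or None
if content is not None:
    for section, rows in content.items():
        print(section, len(rows), "stories")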
28 changes: 28 additions & 0 deletions P13_ContentAggregator/P13_YashAgarwal/createdb.py
@@ -0,0 +1,28 @@
import sqlite3
from sqlite3 import Error


def create():
    try:
        con = sqlite3.connect('database_content.db')

        table_structure = ("CREATE TABLE IF NOT EXISTS Scrapped_data("
                           "source text NOT NULL,"
                           "link text NOT NULL,"
                           "title text NOT NULL,"
                           "Description text NOT NULL,"
                           "CONSTRAINT PK PRIMARY KEY(source,link,title))")
        if con is not None:
            generate_table(con, table_structure)
            return con
        else:
            print("Error in creating database")
            return None

    except Error as e:
        # str(e) is needed here; concatenating the Error object directly raises a TypeError.
        print("Error in create function in createdb file\n" + str(e))
        return None


def generate_table(con, table_structure):
    try:
        curr = con.cursor()
        curr.execute(table_structure)
        #curr.execute('insert into Scrapped_data values("Ash","www.org","yash","this is")')
        con.commit()
    except Error as e:
        print(e)
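
Because the table's primary key is (source, link, title), re-running the scrapers against an unchanged page would make their plain INSERT statements raise sqlite3.IntegrityError; a hedged alternative sketch the scrapers could use instead (the helper name and the example values are hypothetical, not part of this commit):

import createdb as db


def insert_ignore(con, source, link, title, description):
    # INSERT OR IGNORE silently skips rows that would violate the
    # PRIMARY KEY(source, link, title) constraint on a repeat scrape.
    curr = con.cursor()
    curr.execute("INSERT OR IGNORE INTO Scrapped_data VALUES(?,?,?,?)",
                 (source, link, title, description))
    con.commit()
    curr.close()


con = db.create()
if con is not None:
    insert_ignore(con, "moneycontrol", "https://example.com/story", "headline", "summary")
    con.close()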


Binary file not shown.

0 comments on commit 7b9f411
