
Commit

Merge pull request #46 from yashagarwal1999/master
PR Merged. Good Job
gdsoumya authored Jul 7, 2019
2 parents f58f9b5 + bdafc42 commit 7b9f411
Showing 17 changed files with 510 additions and 0 deletions.
40 changes: 40 additions & 0 deletions P13_ContentAggregator/P13_YashAgarwal/Scrape/politics.py
@@ -0,0 +1,40 @@
from bs4 import BeautifulSoup as bs
import requests

import createdb as db

url = "https://www.moneycontrol.com/news/politics/"
page = "moneycontrol"


def scrape_money():
    r = requests.get(url)

    sp = bs(r.content, 'html5lib')
    head = sp.findAll('li', {'class': 'clearfix'})
    #article = sp.findAll('div', {'itemprop': 'articleBody'})
    j = 0

    con = db.create()

    if con is not None:
        for i in head:
            curr = con.cursor()
            headline = i.a['title']
            #print(headline)
            link = i.a['href']
            #print(link)
            #link = page + link
            a = i.p.text
            #print(a)

            j += 1
            curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)", (page, link, headline, a))
            #print(headline + "\n" + link + "\n" + a + "\n")
            #f.write(head.replace(",", "") + "," + link + "," + a.replace(",", "") + "\n")

        con.commit()
        curr.close()
        return True
    else:
        return False
    #f.close()


if __name__ == '__main__':
    scrape_money()
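
A quick way to verify what scrape_money() stored is to query the SQLite file directly; a minimal sketch (not part of this commit), assuming it is run from the same directory as database_content.db, the path hard-coded in createdb.py:

import sqlite3

# Open the same database file that createdb.create() uses.
con = sqlite3.connect('database_content.db')
cur = con.cursor()

# Print the five most recent moneycontrol rows.
for row in cur.execute(
        "SELECT source, link, title FROM Scrapped_data "
        "WHERE source=? ORDER BY rowid DESC LIMIT 5;", ("moneycontrol",)):
    print(row)

con.close()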
42 changes: 42 additions & 0 deletions P13_ContentAggregator/P13_YashAgarwal/Scrape/reuters.py
@@ -0,0 +1,42 @@
from bs4 import BeautifulSoup as bs
import requests

import createdb as db

url = "https://in.reuters.com/news/archive/technologyNews"
page = "reuters"
u = "https://in.reuters.com"


def scrape_reuters():
    r = requests.get(url)

    sp = bs(r.content, 'html5lib')
    div = sp.findAll('div', {'class': 'story-content'})
    heads = sp.findAll('h3', {'class': 'story-title'})
    body = sp.findAll('p')
    j = 0
    #print(div[0].p.text)
    con = db.create()
    if con is not None:
        for i in div:
            curr = con.cursor()
            headline = heads[j].text.strip()

            article = body[j].text
            #print(headline + "\n")
            #print(article)
            j += 1
            link = i.a['href']
            link = u + link
            curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)", (page, link, headline, article))
            #print(headline + "\n" + link + "\n" + article + "\n")
            #f.write(head.replace(",", "") + "," + link + "," + a.replace(",", "") + "\n")

        con.commit()
        curr.close()
        return True
    else:
        return False
    #f.close()


if __name__ == '__main__':
    scrape_reuters()
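
scrape_reuters() relies on heads[j] and body[j] staying aligned with the story-content divs, and body comes from a page-wide findAll('p'); a hedged alternative (an assumption about the page layout, not the committed code) reads the headline and summary from inside each div, so a stray paragraph elsewhere on the page cannot shift the pairing:

# Hypothetical helper sketch: scope all lookups to one story card at a time.
def scrape_reuters_scoped(sp, page, u, con):
    curr = con.cursor()
    for story in sp.findAll('div', {'class': 'story-content'}):
        title_tag = story.find('h3', {'class': 'story-title'})
        para_tag = story.find('p')
        if title_tag is None or para_tag is None or story.a is None:
            continue  # skip malformed cards instead of raising an error
        headline = title_tag.text.strip()
        article = para_tag.text.strip()
        link = u + story.a['href']
        curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)",
                     (page, link, headline, article))
    con.commit()
    curr.close()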
31 changes: 31 additions & 0 deletions P13_ContentAggregator/P13_YashAgarwal/Scrape/scrape.py
@@ -0,0 +1,31 @@
from bs4 import BeautifulSoup as bs
import requests
#from urllib.request import urlopen as uop
import createdb as db

myurl = "https://www.newegg.com/global/in-en/Gaming-Laptops/SubCategory/ID-3365?Tid=1297731"
source = "newegg"


def scrape_newegg():
    r = requests.get(myurl)
    s = bs(r.content, 'html5lib')
    container = s.findAll('div', {'class': 'item-container'})

    con = db.create()
    if con is None:  # guard against a failed database connection, as the other scrapers do
        return False

    for contain in container:
        curr = con.cursor()
        price = contain.find("li", "price-current").text.strip()
        brand = contain.find("div", "item-branding").img["title"].strip() + "\n" + price
        product = contain.find("a", "item-title").text.strip()
        url = contain.find('a', "item-title")
        linkk = url['href']

        curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)", (source, linkk, brand, product))
        #f.write(brand + "," + product.replace(",", "|") + "," + price.replace(",", ""))

    con.commit()
    curr.close()
    #f.close()
    return True
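
Unlike its siblings, scrape.py is not wired into Scrapper.py's start_scrapper() and has no command-line entry point; a minimal guard modeled on the other modules (an assumption, not part of this commit) would let it run standalone:

if __name__ == '__main__':
    if not scrape_newegg():
        print("Error in fetching newegg")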

47 changes: 47 additions & 0 deletions P13_ContentAggregator/P13_YashAgarwal/Scrape/scrape_quora.py
@@ -0,0 +1,47 @@
from bs4 import BeautifulSoup as bs
import requests
#from urllib.request import urlopen as uop
import createdb as db


def scrape_quora():
    url = "https://www.quora.com/topic/Web-Development"
    page = "https://www.quora.com"
    source = "Quora"
    r = requests.get(url)
    #r = requests.get("https://www.quora.com/topic/Hollywood")
    sp = bs(r.content, 'html5lib')
    question_links = sp.findAll('a', attrs={'class': 'question_link'})
    con = db.create()
    if con is not None:
        for Qlink in question_links:
            Qhref = Qlink['href']

            FinalLink = page + Qhref
            r1 = requests.get(FinalLink)
            sp1 = bs(r1.content, 'html5lib')
            span1 = sp1.findAll('p', attrs={'class': 'ui_qtext_para u-ltr u-text-align--start'})

            if not span1:  # skip questions with no answer paragraphs to avoid an IndexError
                continue
            text1 = span1[0].text
            #print(Qhref + "\n")
            #print(text1)

            curr = con.cursor()
            curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)", (source, FinalLink, Qhref, text1))

        con.commit()
        curr.close()
        return True

    else:
        return False


if __name__ == '__main__':
    t = scrape_quora()
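
scrape_quora() issues a separate GET for every question link, so one run can make dozens of requests; a minimal sketch of reusing one connection and pausing between requests (the Session object, the helper name, and the delay are assumptions, not part of the committed code):

import time
import requests
from bs4 import BeautifulSoup as bs

session = requests.Session()  # reuse one connection for all question pages


def fetch_answer_text(final_link, delay=1.0):
    r1 = session.get(final_link)
    sp1 = bs(r1.content, 'html5lib')
    paras = sp1.findAll('p', attrs={'class': 'ui_qtext_para u-ltr u-text-align--start'})
    time.sleep(delay)  # be polite between successive question pages
    return paras[0].text if paras else None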


34 changes: 34 additions & 0 deletions P13_ContentAggregator/P13_YashAgarwal/Scrape/scrapenews.py
@@ -0,0 +1,34 @@
from bs4 import BeautifulSoup as bs
import requests
from urllib.request import urlopen as uop
import createdb as db

url = "https://inshorts.com/en/read"
page = "inshorts"
p1 = "https://inshorts.com"


def scrape_news():
    r = requests.get(url)

    sp = bs(r.content, 'html5lib')
    headline = sp.findAll('div', {'class': 'news-card-title'})
    article = sp.findAll('div', {'itemprop': 'articleBody'})
    j = 0
    con = db.create()
    if con is not None:
        for i in headline:
            curr = con.cursor()
            head = i.span.text
            link = i.a['href']
            link = p1 + link
            a = article[j].text
            j += 1
            curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)", (page, link, head, a))
            #print(head + "\n" + link + "\n" + a + "\n")
            #f.write(head.replace(",", "") + "," + link + "," + a.replace(",", "") + "\n")

        con.commit()
        curr.close()
        return True
    else:
        return False
    #f.close()

43 changes: 43 additions & 0 deletions P13_ContentAggregator/P13_YashAgarwal/Scrape/space.py
@@ -0,0 +1,43 @@
from bs4 import BeautifulSoup as bs
import requests

import createdb as db

url = "https://www.space.com/news"
page = "space"
u = "https://www.space.com"


def scrape_space():
    r = requests.get(url)

    sp = bs(r.content, 'html5lib')
    div = sp.findAll('div', {'class': 'content'})
    heads = sp.findAll('h3', {'class': 'article-name'})
    body = sp.findAll('p', {'class': 'synopsis'})
    links = sp.findAll('a', {'class': 'article-link'})
    j = 0
    #print(div[0].p.text)
    con = db.create()
    if con is not None:
        for i in div:
            curr = con.cursor()
            headline = heads[j].text
            link = links[j]['href']
            article = body[j].text
            """print(headline + "\n")
            print(article)
            print(link)"""
            j += 1

            curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)", (page, link, headline, article))
            #print(headline + "\n" + link + "\n" + article + "\n")
            #f.write(head.replace(",", "") + "," + link + "," + a.replace(",", "") + "\n")

        con.commit()
        curr.close()
        return True
    else:
        return False
    #f.close()


if __name__ == '__main__':
    scrape_space()
43 changes: 43 additions & 0 deletions P13_ContentAggregator/P13_YashAgarwal/Scrape/sports.py
@@ -0,0 +1,43 @@
from bs4 import BeautifulSoup as bs
import requests

import createdb as db

url = "https://www.indiatoday.in/sport"
page = "indiatoday"
u = "https://www.indiatoday.in"


def scrape_sports():
    r = requests.get(url)

    sp = bs(r.content, 'html5lib')
    div = sp.findAll('div', {'class': 'detail'})
    #heads = sp.findAll('h3', {'class': 'story-title'})
    body = sp.findAll('p')
    j = 0
    #print(div[0].p.text)
    con = db.create()
    if con is not None:
        for i in div:
            curr = con.cursor()
            headline = i.h2['title']
            article = i.p.text
            #article = body[j].text
            #print(headline + "\n")
            #print(article)
            j += 1
            link = i.a['href']
            link = u + link
            #print(link)
            curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)", (page, link, headline, article))
            #print(headline + "\n" + link + "\n" + article + "\n")
            #f.write(head.replace(",", "") + "," + link + "," + a.replace(",", "") + "\n")

        con.commit()
        curr.close()
        return True
    else:
        return False
    #f.close()


if __name__ == '__main__':
    scrape_sports()
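
Every Scrape module above parses with bs(r.content, 'html5lib'), which needs the third-party html5lib package installed alongside requests and beautifulsoup4 (roughly pip install requests beautifulsoup4 html5lib; the exact requirements file is not shown in this commit). A hedged fallback sketch for machines where that parser is missing:

from bs4 import BeautifulSoup as bs
from bs4 import FeatureNotFound


def make_soup(content):
    # Hypothetical helper: prefer html5lib, fall back to the parser that
    # ships with the standard library if the package is not installed.
    try:
        return bs(content, 'html5lib')
    except FeatureNotFound:
        return bs(content, 'html.parser')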
58 changes: 58 additions & 0 deletions P13_ContentAggregator/P13_YashAgarwal/Scrapper.py
@@ -0,0 +1,58 @@
import createdb as db
from Scrape import scrape, scrapenews, reuters, politics, sports, space  # scrape (newegg) is imported but not scheduled below
import time

source = ['inshorts', 'moneycontrol', 'reuters', 'indiatoday', 'space']
name = ['Inshorts', 'MoneyControl', 'Reuters India', 'IndiaToday', 'Space']


def start_scrapper():
    if not scrapenews.scrape_news():
        print("Error in fetching Inshorts")
    if not politics.scrape_money():
        print("Error in fetching MoneyControl")
    if not reuters.scrape_reuters():
        print("Error in fetching Reuters")
    if not sports.scrape_sports():
        print("Error in fetching IndiaToday")
    if not space.scrape_space():
        print("Error in fetching space.com")

    print("Done scraping, check db")


def getContent():
    content = {}
    conn = db.create()
    c = conn.cursor()

    for j, i in enumerate(source):
        # Parameterised query, newest rows first.
        z = c.execute("SELECT * FROM Scrapped_data WHERE source=? ORDER BY rowid DESC;", (i,))
        content[name[j]] = z.fetchall()
        if content[name[j]] == []:
            # If any source has no rows yet, treat the whole aggregate as unavailable.
            conn.close()
            return None
    conn.close()
    return content


def getContentForSource(s):
    if s in source:
        i = source.index(s)
        content = {}
        conn = db.create()
        c = conn.cursor()
        z = c.execute("SELECT * FROM Scrapped_data WHERE source=? ORDER BY rowid DESC;", (s,))
        content[name[i]] = z.fetchall()
        if content[name[i]] == []:  # was name[j], which is undefined in this function
            conn.close()
            return None
        conn.close()
        return content
    else:
        return None


def scrapeStart():
    while True:
        start_scrapper()
        time.sleep(3600)  # re-scrape every hour


if __name__ == '__main__':
    start_scrapper()
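
Scrapper.py defines scrapeStart() for the hourly loop, but running the file only performs a single pass; a minimal sketch of how a consumer sitting next to Scrapper.py might keep the loop running in the background and read the aggregated content (the threading setup is an assumption, not part of this commit):

import threading
import Scrapper

# Run the hourly scrape loop in a daemon thread so it stops with the program.
scraper_thread = threading.Thread(target=Scrapper.scrapeStart, daemon=True)
scraper_thread.start()

content = Scrapper.getContent()  # {'Inshorts': [...], 'MoneyControl': [...], ...} or None
if content is not None:
    for section, rows in content.items():
        print(section, len(rows), "stories")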
28 changes: 28 additions & 0 deletions P13_ContentAggregator/P13_YashAgarwal/createdb.py
@@ -0,0 +1,28 @@
import sqlite3
from sqlite3 import Error


def create():
    try:
        con = sqlite3.connect('database_content.db')

        table_structure = ("CREATE TABLE IF NOT EXISTS Scrapped_data("
                           "source text NOT NULL,"
                           "link text NOT NULL,"
                           "title text NOT NULL,"
                           "Description text NOT NULL,"
                           "CONSTRAINT PK PRIMARY KEY(source,link,title))")
        if con is not None:
            generate_table(con, table_structure)
            return con
        else:
            print("Error in creating database")
            return None

    except Error as e:
        # str(e) is needed here; concatenating the Error object directly raises a TypeError.
        print("Error in create function in createdb file\n" + str(e))
        return None


def generate_table(con, table_structure):
    try:
        curr = con.cursor()
        curr.execute(table_structure)
        #curr.execute('insert into Scrapped_data values("Ash","www.org","yash","this is")')
        con.commit()
    except Error as e:
        print(e)
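
Because the table's primary key is (source, link, title), re-running the scrapers against an unchanged page would make their plain INSERT statements raise sqlite3.IntegrityError; a hedged alternative sketch the scrapers could use instead (the helper name and the example values are hypothetical, not part of this commit):

import createdb as db


def insert_ignore(con, source, link, title, description):
    # INSERT OR IGNORE silently skips rows that would violate the
    # PRIMARY KEY(source, link, title) constraint on a repeat scrape.
    curr = con.cursor()
    curr.execute("INSERT OR IGNORE INTO Scrapped_data VALUES(?,?,?,?)",
                 (source, link, title, description))
    con.commit()
    curr.close()


con = db.create()
if con is not None:
    insert_ignore(con, "moneycontrol", "https://example.com/story", "headline", "summary")
    con.close()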


Binary file not shown.

0 comments on commit 7b9f411
