diff --git a/P13_ContentAggregator/P13_YashAgarwal/Scrape/politics.py b/P13_ContentAggregator/P13_YashAgarwal/Scrape/politics.py
new file mode 100644
index 00000000..f15d41b8
--- /dev/null
+++ b/P13_ContentAggregator/P13_YashAgarwal/Scrape/politics.py
@@ -0,0 +1,39 @@
+from bs4 import BeautifulSoup as bs
+import requests
+
+import createdb as db
+
+url = "https://www.moneycontrol.com/news/politics/"
+page = "moneycontrol"
+
+def scrape_money():
+    """Scrape politics headlines from MoneyControl into the database."""
+    try:
+        r = requests.get(url)
+        r.raise_for_status()
+    except requests.RequestException:
+        return False
+
+    sp = bs(r.content, 'html5lib')
+    head = sp.findAll('li', {'class': 'clearfix'})
+
+    con = db.create()
+    if con is None:
+        return False
+
+    curr = con.cursor()
+    for i in head:
+        headline = i.a['title']
+        link = i.a['href']
+        summary = i.p.text
+        # OR IGNORE skips rows already stored (the table has a composite
+        # primary key), so repeated runs do not raise IntegrityError.
+        curr.execute("INSERT OR IGNORE INTO Scrapped_data VALUES(?,?,?,?)",
+                     (page, link, headline, summary))
+
+    con.commit()
+    curr.close()
+    return True
+
+if __name__ == '__main__':
+    scrape_money()
diff --git a/P13_ContentAggregator/P13_YashAgarwal/Scrape/reuters.py b/P13_ContentAggregator/P13_YashAgarwal/Scrape/reuters.py
new file mode 100644
index 00000000..79092ea3
--- /dev/null
+++ b/P13_ContentAggregator/P13_YashAgarwal/Scrape/reuters.py
@@ -0,0 +1,41 @@
+from bs4 import BeautifulSoup as bs
+import requests
+
+import createdb as db
+
+url = "https://in.reuters.com/news/archive/technologyNews"
+page = "reuters"
+base = "https://in.reuters.com"
+
+def scrape_reuters():
+    """Scrape technology headlines from Reuters India into the database."""
+    try:
+        r = requests.get(url)
+        r.raise_for_status()
+    except requests.RequestException:
+        return False
+
+    sp = bs(r.content, 'html5lib')
+    stories = sp.findAll('div', {'class': 'story-content'})
+    heads = sp.findAll('h3', {'class': 'story-title'})
+    body = sp.findAll('p')
+
+    con = db.create()
+    if con is None:
+        return False
+
+    curr = con.cursor()
+    for j, story in enumerate(stories):
+        headline = heads[j].text.strip()
+        article = body[j].text
+        # Story links are relative, so prepend the site root.
+        link = base + story.a['href']
+        curr.execute("INSERT OR IGNORE INTO Scrapped_data VALUES(?,?,?,?)",
+                     (page, link, headline, article))
+
+    con.commit()
+    curr.close()
+    return True
+
+if __name__ == '__main__':
+    scrape_reuters()
diff --git a/P13_ContentAggregator/P13_YashAgarwal/Scrape/scrape.py b/P13_ContentAggregator/P13_YashAgarwal/Scrape/scrape.py
new file mode 100644
index 00000000..24056406
--- /dev/null
+++ b/P13_ContentAggregator/P13_YashAgarwal/Scrape/scrape.py
@@ -0,0 +1,35 @@
+from bs4 import BeautifulSoup as bs
+import requests
+
+import createdb as db
+
+myurl = "https://www.newegg.com/global/in-en/Gaming-Laptops/SubCategory/ID-3365?Tid=1297731"
+source = "newegg"
+
+def scrape_newegg():
+    """Scrape gaming-laptop listings from Newegg into the database."""
+    try:
+        r = requests.get(myurl)
+        r.raise_for_status()
+    except requests.RequestException:
+        return False
+
+    s = bs(r.content, 'html5lib')
+    containers = s.findAll('div', {'class': 'item-container'})
+
+    con = db.create()
+    if con is None:
+        return False
+
+    curr = con.cursor()
+    for contain in containers:
+        price = contain.find("li", "price-current").text.strip()
+        brand = contain.find("div", "item-branding").img["title"].strip() + "\n" + price
+        product = contain.find("a", "item-title").text.strip()
+        link = contain.find('a', "item-title")['href']
+        curr.execute("INSERT OR IGNORE INTO Scrapped_data VALUES(?,?,?,?)",
+                     (source, link, brand, product))
+
+    con.commit()
+    curr.close()
+    return True
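The three scrapers above repeat the same fetch-parse-insert cycle. A minimal sketch of a shared helper they could call instead, assuming the `Scrapped_data` schema from `createdb.py`; the name `insert_rows` is hypothetical and not part of this patch:

```python
# Hypothetical shared helper (not in this patch): takes pre-scraped rows
# and performs the duplicate-safe insert every scraper currently repeats.
import createdb as db

def insert_rows(rows):
    """Insert (source, link, title, description) tuples, skipping duplicates."""
    con = db.create()
    if con is None:
        return False
    curr = con.cursor()
    # OR IGNORE respects the composite primary key (source, link, title),
    # so hourly re-scrapes do not abort on stories seen before.
    curr.executemany("INSERT OR IGNORE INTO Scrapped_data VALUES(?,?,?,?)", rows)
    con.commit()
    curr.close()
    return True
```

Each scraper would then reduce to building a list of tuples and returning `insert_rows(rows)`.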
diff --git a/P13_ContentAggregator/P13_YashAgarwal/Scrape/scrape_quora.py b/P13_ContentAggregator/P13_YashAgarwal/Scrape/scrape_quora.py
new file mode 100644
index 00000000..68c2b0b6
--- /dev/null
+++ b/P13_ContentAggregator/P13_YashAgarwal/Scrape/scrape_quora.py
@@ -0,0 +1,46 @@
+from bs4 import BeautifulSoup as bs
+import requests
+
+import createdb as db
+
+def scrape_quora():
+    """Scrape question links from a Quora topic page into the database."""
+    url = "https://www.quora.com/topic/Web-Development"
+    page = "https://www.quora.com"
+    source = "Quora"
+
+    try:
+        r = requests.get(url)
+        r.raise_for_status()
+    except requests.RequestException:
+        return False
+
+    sp = bs(r.content, 'html5lib')
+    links = sp.findAll('a', attrs={'class': 'question_link'})
+
+    con = db.create()
+    if con is None:
+        return False
+
+    curr = con.cursor()
+    for qlink in links:
+        qhref = qlink['href']
+        final_link = page + qhref
+
+        # Fetch the question page itself to grab the first answer paragraph.
+        r1 = requests.get(final_link)
+        sp1 = bs(r1.content, 'html5lib')
+        paras = sp1.findAll('p', attrs={'class': 'ui_qtext_para u-ltr u-text-align--start'})
+        if not paras:
+            continue
+        text1 = paras[0].text
+
+        curr.execute("INSERT OR IGNORE INTO Scrapped_data VALUES(?,?,?,?)",
+                     (source, final_link, qhref, text1))
+
+    con.commit()
+    curr.close()
+    return True
+
+if __name__ == '__main__':
+    scrape_quora()
diff --git a/P13_ContentAggregator/P13_YashAgarwal/Scrape/scrapenews.py b/P13_ContentAggregator/P13_YashAgarwal/Scrape/scrapenews.py
new file mode 100644
index 00000000..c44cfdf9
--- /dev/null
+++ b/P13_ContentAggregator/P13_YashAgarwal/Scrape/scrapenews.py
@@ -0,0 +1,36 @@
+from bs4 import BeautifulSoup as bs
+import requests
+
+import createdb as db
+
+url = "https://inshorts.com/en/read"
+page = "inshorts"
+base = "https://inshorts.com"
+
+def scrape_news():
+    """Scrape headlines and article bodies from Inshorts into the database."""
+    try:
+        r = requests.get(url)
+        r.raise_for_status()
+    except requests.RequestException:
+        return False
+
+    sp = bs(r.content, 'html5lib')
+    headlines = sp.findAll('div', {'class': 'news-card-title'})
+    articles = sp.findAll('div', {'itemprop': 'articleBody'})
+
+    con = db.create()
+    if con is None:
+        return False
+
+    curr = con.cursor()
+    for j, card in enumerate(headlines):
+        head = card.span.text
+        link = base + card.a['href']
+        body = articles[j].text
+        curr.execute("INSERT OR IGNORE INTO Scrapped_data VALUES(?,?,?,?)",
+                     (page, link, head, body))
+
+    con.commit()
+    curr.close()
+    return True
diff --git a/P13_ContentAggregator/P13_YashAgarwal/Scrape/space.py b/P13_ContentAggregator/P13_YashAgarwal/Scrape/space.py
new file mode 100644
index 00000000..60203fde
--- /dev/null
+++ b/P13_ContentAggregator/P13_YashAgarwal/Scrape/space.py
@@ -0,0 +1,40 @@
+from bs4 import BeautifulSoup as bs
+import requests
+
+import createdb as db
+
+url = "https://www.space.com/news"
+page = "space"
+
+def scrape_space():
+    """Scrape news headlines and synopses from Space.com into the database."""
+    try:
+        r = requests.get(url)
+        r.raise_for_status()
+    except requests.RequestException:
+        return False
+
+    sp = bs(r.content, 'html5lib')
+    divs = sp.findAll('div', {'class': 'content'})
+    heads = sp.findAll('h3', {'class': 'article-name'})
+    body = sp.findAll('p', {'class': 'synopsis'})
+    links = sp.findAll('a', {'class': 'article-link'})
+
+    con = db.create()
+    if con is None:
+        return False
+
+    curr = con.cursor()
+    for j, _ in enumerate(divs):
+        headline = heads[j].text
+        link = links[j]['href']
+        article = body[j].text
+        curr.execute("INSERT OR IGNORE INTO Scrapped_data VALUES(?,?,?,?)",
+                     (page, link, headline, article))
+
+    con.commit()
+    curr.close()
+    return True
+
+if __name__ == '__main__':
+    scrape_space()
diff --git a/P13_ContentAggregator/P13_YashAgarwal/Scrape/sports.py b/P13_ContentAggregator/P13_YashAgarwal/Scrape/sports.py
new file mode 100644
index 00000000..ee9bfaa1
--- /dev/null
+++ b/P13_ContentAggregator/P13_YashAgarwal/Scrape/sports.py
@@ -0,0 +1,39 @@
+from bs4 import BeautifulSoup as bs
+import requests
+
+import createdb as db
+
+url = "https://www.indiatoday.in/sport"
+page = "indiatoday"
+base = "https://www.indiatoday.in"
+
+def scrape_sports():
+    """Scrape sports headlines from India Today into the database."""
+    try:
+        r = requests.get(url)
+        r.raise_for_status()
+    except requests.RequestException:
+        return False
+
+    sp = bs(r.content, 'html5lib')
+    divs = sp.findAll('div', {'class': 'detail'})
+
+    con = db.create()
+    if con is None:
+        return False
+
+    curr = con.cursor()
+    for i in divs:
+        headline = i.h2['title']
+        article = i.p.text
+        # Story links are relative, so prepend the site root.
+        link = base + i.a['href']
+        curr.execute("INSERT OR IGNORE INTO Scrapped_data VALUES(?,?,?,?)",
+                     (page, link, headline, article))
+
+    con.commit()
+    curr.close()
+    return True
+
+if __name__ == '__main__':
+    scrape_sports()
diff --git a/P13_ContentAggregator/P13_YashAgarwal/Scrapper.py b/P13_ContentAggregator/P13_YashAgarwal/Scrapper.py
new file mode 100644
index 00000000..cd865bad
--- /dev/null
+++ b/P13_ContentAggregator/P13_YashAgarwal/Scrapper.py
@@ -0,0 +1,63 @@
+import time
+
+import createdb as db
+from Scrape import scrapenews, reuters, politics, sports, space
+
+source = ['inshorts', 'moneycontrol', 'reuters', 'indiatoday', 'space']
+name = ['Inshorts', 'MoneyControl', 'Reuters India', 'IndiaToday', 'Space']
+
+def start_scrapper():
+    if not scrapenews.scrape_news():
+        print("Error in fetching Inshorts")
+    if not politics.scrape_money():
+        print("Error in fetching MoneyControl")
+    if not reuters.scrape_reuters():
+        print("Error in fetching Reuters")
+    if not sports.scrape_sports():
+        print("Error in fetching IndiaToday")
+    if not space.scrape_space():
+        print("Error in fetching Space.com")
+
+    print("Done scraping; check the db")
+
+def getContent():
+    """Return the latest rows for every source, or None if any source is empty."""
+    content = {}
+    conn = db.create()
+    c = conn.cursor()
+
+    for j, i in enumerate(source):
+        # Parameterized query: never interpolate values into SQL strings.
+        z = c.execute("SELECT * FROM Scrapped_data WHERE source=? ORDER BY rowid DESC", (i,))
+        content[name[j]] = z.fetchall()
+        if content[name[j]] == []:
+            conn.close()
+            return None
+    conn.close()
+    return content
+
+def getContentForSource(s):
+    """Return all rows for one source, or None if it is unknown or empty."""
+    if s not in source:
+        return None
+    i = source.index(s)
+    content = {}
+    conn = db.create()
+    c = conn.cursor()
+    z = c.execute("SELECT * FROM Scrapped_data WHERE source=? ORDER BY rowid DESC", (s,))
+    # Index with i, not an undefined j, to look up the display name.
+    content[name[i]] = z.fetchall()
+    if content[name[i]] == []:
+        conn.close()
+        return None
+    conn.close()
+    return content
+
+def scrapeStart():
+    # Re-scrape all sources once an hour.
+    while True:
+        start_scrapper()
+        time.sleep(3600)
+
+if __name__ == '__main__':
+    start_scrapper()
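`Scrapper.py` originally interpolated the source name into its SELECT with `str.format`, which is an injection risk and breaks on values containing quotes. A self-contained sketch of the `?`-placeholder pattern the rewritten queries use, run against an in-memory database rather than `database_content.db`:

```python
import sqlite3

# Stand-in table so the example runs without the real database file.
con = sqlite3.connect(':memory:')
con.execute("CREATE TABLE Scrapped_data(source, link, title, Description)")
con.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)",
            ("inshorts", "https://example.com/a", "A headline", "Body text"))

# The ? placeholder makes sqlite3 treat the value as data, never as SQL,
# so even a malicious source string cannot alter the statement.
rows = con.execute("SELECT * FROM Scrapped_data WHERE source=? "
                   "ORDER BY rowid DESC", ("inshorts",)).fetchall()
print(rows)
```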
diff --git a/P13_ContentAggregator/P13_YashAgarwal/Screenshots/Screenshot from 2019-07-07 20-19-48.png b/P13_ContentAggregator/P13_YashAgarwal/Screenshots/Screenshot from 2019-07-07 20-19-48.png
new file mode 100644
index 00000000..d8e2f627
Binary files /dev/null and b/P13_ContentAggregator/P13_YashAgarwal/Screenshots/Screenshot from 2019-07-07 20-19-48.png differ
diff --git a/P13_ContentAggregator/P13_YashAgarwal/Screenshots/Screenshot from 2019-07-07 20-19-52.png b/P13_ContentAggregator/P13_YashAgarwal/Screenshots/Screenshot from 2019-07-07 20-19-52.png
new file mode 100644
index 00000000..7b0043fe
Binary files /dev/null and b/P13_ContentAggregator/P13_YashAgarwal/Screenshots/Screenshot from 2019-07-07 20-19-52.png differ
diff --git a/P13_ContentAggregator/P13_YashAgarwal/Screenshots/Screenshot from 2019-07-07 20-19-58.png b/P13_ContentAggregator/P13_YashAgarwal/Screenshots/Screenshot from 2019-07-07 20-19-58.png
new file mode 100644
index 00000000..0d2fdbe9
Binary files /dev/null and b/P13_ContentAggregator/P13_YashAgarwal/Screenshots/Screenshot from 2019-07-07 20-19-58.png differ
diff --git a/P13_ContentAggregator/P13_YashAgarwal/createdb.py b/P13_ContentAggregator/P13_YashAgarwal/createdb.py
new file mode 100644
index 00000000..7e23f137
--- /dev/null
+++ b/P13_ContentAggregator/P13_YashAgarwal/createdb.py
@@ -0,0 +1,27 @@
+import sqlite3
+from sqlite3 import Error
+
+def create():
+    """Open (and if needed create) the SQLite database and its table."""
+    try:
+        con = sqlite3.connect('database_content.db')
+
+        table_structure = ("CREATE TABLE IF NOT EXISTS Scrapped_data("
+                           "source text NOT NULL,"
+                           "link text NOT NULL,"
+                           "title text NOT NULL,"
+                           "Description text NOT NULL,"
+                           "CONSTRAINT PK PRIMARY KEY(source,link,title))")
+        generate_table(con, table_structure)
+        return con
+    except Error as e:
+        print("Error in create function in createdb file\n" + str(e))
+        return None
+
+def generate_table(con, table_structure):
+    try:
+        curr = con.cursor()
+        curr.execute(table_structure)
+        con.commit()
+    except Error as e:
+        print(e)
diff --git a/P13_ContentAggregator/P13_YashAgarwal/database_content.db b/P13_ContentAggregator/P13_YashAgarwal/database_content.db
new file mode 100644
index 00000000..cce661bf
Binary files /dev/null and b/P13_ContentAggregator/P13_YashAgarwal/database_content.db differ
diff --git a/P13_ContentAggregator/P13_YashAgarwal/my_server.py b/P13_ContentAggregator/P13_YashAgarwal/my_server.py
new file mode 100644
index 00000000..9e5952fc
--- /dev/null
+++ b/P13_ContentAggregator/P13_YashAgarwal/my_server.py
@@ -0,0 +1,29 @@
+import threading
+
+from flask import Flask, render_template, redirect
+
+import Scrapper as s
+
+app = Flask(__name__)
+
+@app.route('/', methods=["GET"])
+def home():
+    content = s.getContent()
+    if content is None:
+        return "Still fetching data. Try again!"
+    return render_template('index.html', content=content)
+
+@app.route('/readmore/<source>', methods=["GET"])
+def readmore(source):
+    content = s.getContentForSource(source)
+    if content is None:
+        return redirect('/404')
+    return render_template('readmore.html', content=content)
+
+if __name__ == '__main__':
+    # Run the scraper loop on a daemon thread so it dies with the server.
+    thread = threading.Thread(target=s.scrapeStart)
+    thread.daemon = True
+    thread.start()
+
+    app.run()
diff --git a/P13_ContentAggregator/P13_YashAgarwal/static/style.css b/P13_ContentAggregator/P13_YashAgarwal/static/style.css
new file mode 100644
index 00000000..6bf46aaf
--- /dev/null
+++ b/P13_ContentAggregator/P13_YashAgarwal/static/style.css
@@ -0,0 +1,70 @@
+* {
+    margin: 0px;
+    padding: 0px;
+}
+h1{
+    position: fixed;
+    cursor: default;
+    margin: 0;
+    padding: 1vw;
+    background: black;
+    text-align: center;
+    text-transform: uppercase;
+    font-size: 4vw;
+    font-family: sans-serif;
+    color: white;
+    font-weight: bolder;
+    width: 100%;
+}
+.content{
+    display: flex;
+    flex-wrap: wrap;
+    justify-content: space-around;
+    padding-top: 7vw;
+}
+.content h2{
+    font-family: monospace;
+    font-size: 2vw;
+    font-weight: bolder;
+    margin: 1vw 1vw 2vw 0vw;
+}
+.contenta{
+    display: block;
+    text-decoration: none;
+    white-space: nowrap;
+    overflow: hidden;
+    text-overflow: ellipsis;
+    font-size: 1.4vw;
+    color: black;
+    font-family: sans-serif;
+    border-bottom: 0.1vw solid #8080803d;
+    margin-top: -0.7vw;
+    padding-bottom: 0.6vw;
+}
+.contenta:hover{
+    text-decoration: underline;
+}
+.data{
+    width: 35%;
+    margin: 2vw;
+    text-overflow: ellipsis;
+}
+.readmore{
+    text-decoration: none;
+    color: #cf3030;
+    font-family: sans-serif;
+    font-weight: bold;
+    font-size: 1.4vw;
+}
+.readmore:hover{
+    text-decoration: underline;
+}
+h1 a{
+    text-decoration: none;
+    color: white;
+}
+.read{
+    padding-top: 5vw;
+    width: 82%;
+    margin: 0 auto;
+}
\ No newline at end of file
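`my_server.py` (above) runs the scraper loop on a daemon thread so Flask can serve pages while scraping continues in the background. A standalone sketch of that pattern, with `worker` standing in for `Scrapper.scrapeStart`:

```python
import threading
import time

def worker():
    # Stands in for Scrapper.scrapeStart: scrape, then sleep an hour.
    while True:
        print("scraping...")
        time.sleep(3600)

thread = threading.Thread(target=worker)
thread.daemon = True   # a daemon thread dies with the main thread,
thread.start()         # so stopping the server also stops the scraper

time.sleep(1)          # give the worker a moment to run once
```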
diff --git a/P13_ContentAggregator/P13_YashAgarwal/templates/index.html b/P13_ContentAggregator/P13_YashAgarwal/templates/index.html
new file mode 100644
index 00000000..28c7b3a8
--- /dev/null
+++ b/P13_ContentAggregator/P13_YashAgarwal/templates/index.html
@@ -0,0 +1,21 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Content Master</title>
+    <link rel="stylesheet" href="/static/style.css">
+</head>
+<body>
+    <h1><a href="/">Content Master</a></h1>
+    <div class="content">
+        {% for key, value in content.items() %}
+        <div class="data">
+            <h2>{{key}}</h2>
+            {% for i in range(0, 10) if i < value|length %}
+            <a class="contenta" href="{{value[i][1]}}">{{value[i][2]}}</a>
+            {% endfor %}
+            <a class="readmore" href="/readmore/{{value[0][0]}}">Read More..</a>
+        </div>
+        {% endfor %}
+    </div>
+</body>
+</html>
diff --git a/P13_ContentAggregator/P13_YashAgarwal/templates/readmore.html b/P13_ContentAggregator/P13_YashAgarwal/templates/readmore.html
new file mode 100644
index 00000000..1b4a45ca
--- /dev/null
+++ b/P13_ContentAggregator/P13_YashAgarwal/templates/readmore.html
@@ -0,0 +1,20 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Content Master</title>
+    <link rel="stylesheet" href="/static/style.css">
+</head>
+<body>
+    <h1><a href="/">Content Master</a></h1>
+    <div class="read">
+        {% for key, value in content.items() %}
+        <div>
+            <h2>{{key}}</h2>
+            {% for i in value %}
+            <a class="contenta" href="{{i[1]}}">{{i[1]}}</a>
+            {% endfor %}
+        </div>
+        {% endfor %}
+    </div>
+</body>
+</html>
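A short way to exercise the aggregator end to end, assuming the working directory is `P13_ContentAggregator/P13_YashAgarwal` so the imports above resolve; column indices follow the `(source, link, title, Description)` schema:

```python
import Scrapper

Scrapper.start_scrapper()        # one scrape pass over all five sources
content = Scrapper.getContent()  # None until every source has rows
if content is not None:
    for site, rows in content.items():
        print(site, "->", rows[0][2])   # newest title per source
```

Running `my_server.py` afterwards serves the same data at `/` and `/readmore/<source>`.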