Skip to content

Commit

Permalink
Merge pull request #30 from isumit19/master
Browse files Browse the repository at this point in the history
Merged
  • Loading branch information
gdsoumya authored Jul 6, 2019
2 parents da92b2d + 9dc0ab4 commit f6acdf0
Show file tree
Hide file tree
Showing 30 changed files with 385 additions and 0 deletions.
18 changes: 18 additions & 0 deletions P13_ContentAggregator/ContentAggregator/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Content Aggregator
GDTC Hack_in Project

### Description
The project scrapes content from three websites (AndroidPolice.com, Toptal.com and Careers360.com):
<ul>
<li><a href="https://www.androidpolice.com/">AndroidPolice</a> - Top Tech News
<li><a href="https://www.toptal.com/developers/blog">Toptal</a> - Developers Blog Headlines
<li><a href="https://engineering.careers360.com/colleges/ranking">Careers360</a> - Top Engineering Colleges
</ul>

## Screenshots

<img src="Screenshots/1.jpg">
<img src="Screenshots/7.jpg">
<img src="Screenshots/2.jpg">
<img src="Screenshots/8.jpg">
<img src="Screenshots/3.jpg">
26 changes: 26 additions & 0 deletions P13_ContentAggregator/ContentAggregator/ScapeWeb/AndroidPolice.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import requests
import sqlite3 as sq
from bs4 import BeautifulSoup as bs


def scrape(db):
    """Scrape the AndroidPolice front page into the ``content_agg`` table.

    Fetches https://www.androidpolice.com/, takes the second ``<a>`` of every
    ``<header class="post-header">`` element and stores its text and ``href``
    as a row whose source is ``'AndroidPolice'``.  Headers are walked in
    reverse page order, so the entry shown first on the page is inserted last.

    Args:
        db: Object exposing ``connect()``, which returns an open sqlite3
            connection or ``None`` when the database is unavailable.

    Returns:
        ``True`` when the page was fetched and the inserts were committed,
        ``False`` when the page could not be fetched or ``db.connect()``
        returned ``None``.
    """
    url = "https://www.androidpolice.com/"
    try:
        # Without a timeout a stalled server would block the caller forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        # Report the failure through the return value instead of letting the
        # exception escape into the caller's scrape loop.
        print("Error : ", e)
        return False
    content = bs(response.content, "html.parser")

    con = db.connect()
    if con is None:
        return False
    try:
        cursor = con.cursor()
        for header in reversed(content.find_all('header', class_="post-header")):
            try:
                link = header.find_all('a')[1]
                cursor.execute(
                    "Insert into content_agg(source, title, url) values('AndroidPolice',?,?)",
                    (link.text.strip(), link['href'].strip()))
            except sq.IntegrityError:
                # The table constraints rejected the row (typically a repeat
                # of an entry stored by an earlier run); skip it.
                pass
            except Exception as e:
                # Best effort: one malformed header must not stop the others.
                print("Error : ", e)
        con.commit()
    finally:
        # Close the connection even if the commit fails.
        con.close()
    return True
28 changes: 28 additions & 0 deletions P13_ContentAggregator/ContentAggregator/ScapeWeb/TopTal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import requests
import sqlite3 as sq
from bs4 import BeautifulSoup as bs


def scrape(db):
    """Scrape pages 1-3 of the Toptal developers blog into ``content_agg``.

    For every ``<div class="blog_post_card-content">`` card the title text and
    the ``href`` of the title link are stored as a row whose source is
    ``'TopTal'``.  All three pages are processed on one connection; the
    inserts from every page that could be fetched are committed together.

    Args:
        db: Object exposing ``connect()``, which returns an open sqlite3
            connection or ``None`` when the database is unavailable.

    Returns:
        ``True`` when all three pages were fetched and stored, ``False`` when
        ``db.connect()`` returned ``None`` or at least one page could not be
        fetched.
    """
    con = db.connect()
    if con is None:
        return False

    all_pages_ok = True
    try:
        cursor = con.cursor()
        # Pages are visited last-to-first, mirroring the reversed() walk inside
        # each page, so the cards of page 1 are inserted last.
        # NOTE(review): assumes page 1 lists the newest posts - confirm.
        for page in range(3, 0, -1):
            page_url = "https://www.toptal.com/developers/blog?page=" + str(page)
            try:
                # Without a timeout a stalled server would block forever.
                response = requests.get(page_url, timeout=30)
                response.raise_for_status()
            except requests.RequestException as e:
                print("Error : ", e)
                all_pages_ok = False
                continue
            content = bs(response.content, "html.parser")
            for card in reversed(content.find_all('div', class_="blog_post_card-content")):
                try:
                    title = card.find('div', class_="blog_post_card__title").text.strip()
                    link = card.find('a', class_="blog_post_card__title-link")['href'].strip()
                    cursor.execute(
                        "Insert into content_agg(source,title, url) values('TopTal',?,?)",
                        (title, link))
                except sq.IntegrityError:
                    # The table constraints rejected the row (typically a
                    # repeat of a post stored by an earlier run); skip it.
                    pass
                except Exception as e:
                    # Best effort: one malformed card must not stop the rest.
                    print("Error : ", e)
        con.commit()
    finally:
        # Close the connection even if the commit fails.
        con.close()
    return all_pages_ok

Binary file not shown.
Binary file not shown.
Binary file not shown.
29 changes: 29 additions & 0 deletions P13_ContentAggregator/ContentAggregator/ScapeWeb/college.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import requests
from bs4 import BeautifulSoup as bs
import sqlite3 as sq


def scapecollege(db):
    """Scrape the Careers360 engineering ranking into the ``college`` table.

    Fetches https://engineering.careers360.com/colleges/ranking and, for every
    ``<td class="colgName">`` cell, stores the cell text and the ``href`` of
    its link as a row.  Cells are walked in reverse page order, so the college
    listed first on the page is inserted last.

    Args:
        db: Object exposing ``connectcollege()``, which returns an open
            sqlite3 connection or ``None`` when the database is unavailable.

    Returns:
        ``True`` when the page was fetched and the inserts were committed,
        ``False`` when the page could not be fetched or
        ``db.connectcollege()`` returned ``None``.
    """
    url = "https://engineering.careers360.com/colleges/ranking"
    try:
        # Without a timeout a stalled server would block the caller forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        # Report the failure through the return value instead of letting the
        # exception escape into the caller's scrape loop.
        print("Error : ", e)
        return False
    content = bs(response.content, "html.parser")

    con = db.connectcollege()
    if con is None:
        return False
    try:
        cursor = con.cursor()
        for cell in reversed(content.find_all('td', class_="colgName")):
            try:
                title = cell.text.strip()
                link = cell.a['href'].strip()
                cursor.execute("Insert into college(title, url) values(?,?)", (title, link))
            except sq.IntegrityError:
                # The table constraints rejected the row (typically a repeat
                # of a college stored by an earlier run); skip it.
                pass
            except Exception as e:
                # Best effort: one malformed cell must not stop the others.
                print("Error : ", e)
        con.commit()
    finally:
        # Close the connection even if the commit fails.
        con.close()
    return True
66 changes: 66 additions & 0 deletions P13_ContentAggregator/ContentAggregator/ScrapeJob.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from database import database as db
import time
from ScapeWeb import AndroidPolice, TopTal, college
# Values matched against the ``source`` column of content_agg when reading.
source = [
    'AndroidPolice',
    'TopTal',
]
# Dictionary keys used for the returned content, index-aligned with ``source``.
name = [
    'AndroidPolice',
    'TopTal',
]


def scrapeAll():
    """Run every scraper once and report the ones that failed.

    Each scraper is called with the ``db`` module.  A scraper that returns a
    falsy value or raises is reported on stdout, and the remaining scrapers
    still run, so a single failing site cannot abort the whole pass or the
    loop that calls this function.
    """
    jobs = (
        (AndroidPolice.scrape, "ERROR: AndroidPolice SCRAPE ERROR"),
        (TopTal.scrape, "ERROR : TopTal SCRAPE ERROR"),
        (college.scapecollege, "ERROR : 360Careers SCRAPE ERROR"),
    )
    for run, failure_message in jobs:
        try:
            succeeded = run(db)
        except Exception as e:
            # Top-level boundary of the scrape job: log and carry on with the
            # next site rather than letting one failure stop everything.
            print("Error : ", e)
            succeeded = False
        if not succeeded:
            print(failure_message)
    print("SCRAPING COMPLETE")


def getContent():
    """Return the stored rows of every known source, highest rowid first.

    Returns:
        A dict mapping each entry of ``name`` to the list of ``content_agg``
        row tuples whose source is the matching entry of ``source``, or
        ``None`` when the database cannot be opened or any source has no
        rows.
    """
    conn = db.connect()
    if conn is None:
        # db.connect() signals failure by returning None.
        return None

    content = {}
    try:
        cursor = conn.cursor()
        for label, src in zip(name, source):
            # Bind the source as a parameter instead of formatting it into
            # the SQL text.
            rows = cursor.execute(
                "Select * from content_agg where source=? order by rowid desc;",
                (src,)).fetchall()
            if not rows:
                return None
            content[label] = rows
    finally:
        # Close on every exit path, including a failing query.
        conn.close()
    return content


def getContentForSource(s):
    """Return the stored rows of one source, highest rowid first.

    Args:
        s: Source identifier; must be one of the entries of ``source``.

    Returns:
        A single-entry dict mapping the matching entry of ``name`` to the
        list of ``content_agg`` row tuples for ``s``, or ``None`` when ``s``
        is unknown, the database cannot be opened, or no rows are stored.
    """
    if s not in source:
        return None

    conn = db.connect()
    if conn is None:
        # db.connect() signals failure by returning None.
        return None
    try:
        # Bind the source as a parameter instead of formatting it into the
        # SQL text.
        rows = conn.cursor().execute(
            "Select * from content_agg where source=? order by rowid desc;",
            (s,)).fetchall()
    finally:
        # Close on every exit path, including a failing query.
        conn.close()

    if not rows:
        return None
    return {name[source.index(s)]: rows}


def scrapeStart():
    """Run ``scrapeAll()`` forever, sleeping 10 seconds after each pass.

    This function never returns.
    """
    pause_seconds = 10
    while True:
        scrapeAll()
        time.sleep(pause_seconds)


def getContentCollege():
    """Return all stored colleges, highest rowid first.

    Returns:
        A dict with the single key ``"Top Colleges"`` mapped to the list of
        ``college`` row tuples, or ``None`` when the database cannot be
        opened or the table is empty.
    """
    conn = db.connectcollege()
    if conn is None:
        # db.connectcollege() signals failure by returning None.
        return None
    try:
        rows = conn.cursor().execute(
            "Select * from college order by rowid desc;").fetchall()
    finally:
        # Close on every exit path, including a failing query.
        conn.close()

    if not rows:
        return None
    return {"Top Colleges": rows}
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
50 changes: 50 additions & 0 deletions P13_ContentAggregator/ContentAggregator/database/database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import sqlite3
from sqlite3 import Error


def connect():
    """Open ``content.db`` and make sure the ``content_agg`` table exists.

    Returns:
        An open sqlite3 connection, or ``None`` when opening the database
        raises ``sqlite3.Error`` (the error is printed).
    """
    sql_create_projects_table = "CREATE TABLE IF NOT EXISTS content_agg (source text NOT NULL,title text NOT NULL,url text NOT NULL, CONSTRAINT PK PRIMARY KEY(source,title,url));"
    try:
        conn = sqlite3.connect('content.db')
    except Error as e:
        print(e)
        return None
    # sqlite3.connect() either returns a connection or raises, so there is no
    # None case to handle here.
    create_table(conn, sql_create_projects_table)
    return conn


def create_table(conn, create_table_sql):
    """Execute a table-creation statement on ``conn`` and commit it.

    Args:
        conn: Open sqlite3 connection.
        create_table_sql: SQL text of the statement to execute.

    Any ``sqlite3.Error`` raised while executing or committing is printed
    rather than propagated; the function always returns ``None``.
    """
    try:
        conn.cursor().execute(create_table_sql)
        conn.commit()
    except Error as exc:
        print("Error :", exc)


def connectcollege():
    """Open ``content.db`` and make sure the ``college`` table exists.

    Returns:
        An open sqlite3 connection, or ``None`` when opening the database
        raises ``sqlite3.Error`` (the error is printed).
    """
    sql_create_projects_table = "CREATE TABLE IF NOT EXISTS college (title text NOT NULL,url text NOT NULL, CONSTRAINT PK PRIMARY KEY(title,url));"
    try:
        conn = sqlite3.connect('content.db')
    except Error as e:
        print(e)
        return None
    # sqlite3.connect() either returns a connection or raises, so there is no
    # None case to handle here.
    create_table(conn, sql_create_projects_table)
    return conn


def create_tablecollege(conn, create_table_sql):
    """Execute a table-creation statement on ``conn`` and commit it.

    Args:
        conn: Open sqlite3 connection.
        create_table_sql: SQL text of the statement to execute.

    Any ``sqlite3.Error`` raised while executing or committing is printed
    rather than propagated; the function always returns ``None``.
    """
    try:
        cur = conn.cursor()
        cur.execute(create_table_sql)
        conn.commit()
    except Error as err:
        print("Error :", err)
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
66 changes: 66 additions & 0 deletions P13_ContentAggregator/ContentAggregator/static/style.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
body{
font-family: monospace, serif;
background-image: linear-gradient(147deg, #000000 0%, #2c3e50 74%);
height: 100%;
width: 100%;
z-index: 1;
background-repeat: no-repeat;
color: white;
}
* {
margin: 0px;
padding: 0px;
}
h1{
padding: 1vw;
text-align: center;
text-transform: uppercase;
font-size: 4vw;
font-weight: bolder;
width: 100%;
}
.content{
display: flex;
flex-wrap: wrap;
justify-content: space-around;
padding-top: 7vw;
}
.content h2{
font-size: 2vw;
font-weight: bolder;
margin: 1vw 1vw 2vw 0vw;
}
.contenta{
display: block;
text-decoration: none;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
font-size: 1vw
}
.contenta:hover{
text-decoration: underline;
}
.data{
width: 35%;
margin: 2vw;
text-overflow: ellipsis;
}
.readmore{
text-decoration: none;
color: #c45454;
font-weight: bold;
font-size: 1.4vw;
}
.readmore:hover{
text-decoration: underline;
}
a{
text-decoration: none;
color: white;
}
a.button {
font-size: 1vw;
text-decoration: none;
color: white;
}
20 changes: 20 additions & 0 deletions P13_ContentAggregator/ContentAggregator/templates/college.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<!DOCTYPE html>
<html>
<head>
<title>Content Aggregator</title>
<link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='style.css') }}">
</head>
<body>
<h1>Content Aggregator</h1>
<div class='content'>
{% for key, value in content.items() %}
<div class='data'>
<h2>{{key}}</h2>
{% for i in range(0,value|length) %}
<a class='contenta' target="_blank" href="{{value[i][1]}}">{{i+1}}. {{value[i][0]}}</a><br>
{% endfor %}
</div>
{% endfor %}
</div>
</body>
</html>
28 changes: 28 additions & 0 deletions P13_ContentAggregator/ContentAggregator/templates/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<!DOCTYPE html>
<html>
<head>
<title>Content Aggregator</title>
<link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='style.css') }}">
</head>
<body>
<h1>Content Aggregator</h1>
<div class='content'>
{% for key, value in content.items() %}
<div class='data'>
<h2>{{key}}</h2>
{% for i in range(0,10) if i<value|length %}
<a class='contenta' target="_blank" href="{{value[i][2]}}">{{value[i][1]}}</a><br>
{% endfor %}
<a class='readmore' href="{{url_for('.readmore',source=content[key][0][0])}}">Read More..</a>
</div>
{% endfor %}
</div>
<center>
<div class="college">
<h2>Top Colleges of India</h2>
<a class="button" href="{{url_for('.college')}}">Visit Here!!</a>
</div>
</center>


</body>
</html>
20 changes: 20 additions & 0 deletions P13_ContentAggregator/ContentAggregator/templates/readmore.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<!DOCTYPE html>
<html>
<head>
<title>Content Aggregator</title>
<link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='style.css') }}">
</head>
<body>
<h1>Content Aggregator</h1>
<div class='content'>
{% for key, value in content.items() %}
<div class='data'>
<h2>{{key}}</h2>
{% for i in range(0,value|length) %}
<a class='contenta' target="_blank" href={{value[i][2]}}>{{value[i][1]}}</a><br>
{% endfor %}
</div>
{% endfor %}
</div>
</body>
</html>
Loading

0 comments on commit f6acdf0

Please sign in to comment.