Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Task4 #77

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
The diff you're trying to view is too large. We only load the first 3000 changed files.
113 changes: 113 additions & 0 deletions Rahul/task4/Crawler/crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
from bs4 import BeautifulSoup
import requests
import pymongo
import urllib
import sys
from popularlinks import Popularity
import lxml


class Crawler():
    """Depth-limited recursive crawler that stores page summaries in MongoDB.

    For every page it can fetch, the crawler records the URL, the ``<title>``
    text, the concatenated ``<p>`` text and a popularity score in the
    ``search_results`` collection, then follows the page's links until the
    requested depth is exhausted.
    """

    connection_url = "mongodb://127.0.0.1:27017/"

    client = pymongo.MongoClient(connection_url)

    db = client.gluglefinal

    # Kept as a class attribute for backward compatibility; start_crawl()
    # rebinds it on the instance so rules never leak from one crawl to the next.
    disallowed_links = []

    # Seconds to wait for a single HTTP response. requests has no default
    # timeout, so one stalled server would otherwise hang the whole crawl.
    request_timeout = 10

    # True once the text index has been requested through the shared client.
    _index_ready = False

    # Maps each URL already crawled in this run to the largest remaining
    # depth it was crawled with (None until a crawl starts).
    _seen_depths = None

    def start_crawl(self, url, depth):
        """Crawl ``url`` down to ``depth`` link levels, honouring robots.txt.

        Args:
            url: Absolute URL of the page to start from.
            depth: Number of link levels to follow below ``url``; ``0`` stores
                only the start page.

        The MongoDB client is closed when the crawl finishes, whether it
        succeeded or raised.
        """
        robots_url = urllib.parse.urljoin(url, '/robots.txt')
        self.disallowed_links = []
        self._seen_depths = {}

        try:
            try:
                robots = requests.get(robots_url, timeout=self.request_timeout)
            except requests.RequestException:
                robots = None

            if robots is not None and robots.ok:
                self.disallowed_links = self._parse_disallowed(url, robots.text)
                print("robots found and appended in disallowed_links...")
            else:
                # No usable robots.txt: crawl without restrictions.
                print("robots not found!!!")

            self.crawl(url, depth, self.disallowed_links)
        finally:
            # Close once, after the whole recursion is done; closing inside
            # crawl() would invalidate the client for calls still in progress.
            self.client.close()

    @staticmethod
    def _parse_disallowed(base_url, robots_text):
        """Return absolute URL prefixes disallowed for every crawler.

        Only ``Disallow`` rules inside ``User-agent: *`` groups are collected.
        ``Allow`` overrides and wildcard patterns are not interpreted.
        """
        blocked = []
        agents = []
        reading_agents = False
        for raw_line in robots_text.splitlines():
            line = raw_line.split('#', 1)[0].strip()
            field, sep, value = line.partition(':')
            if not sep:
                continue
            field = field.strip().lower()
            value = value.strip()
            if field == 'user-agent':
                # Consecutive User-agent lines share one group of rules.
                if not reading_agents:
                    agents = []
                    reading_agents = True
                agents.append(value)
                continue
            reading_agents = False
            if field == 'disallow' and value.startswith('/') and '*' in agents:
                blocked.append(urllib.parse.urljoin(base_url, value))
        return blocked

    def _ensure_index(self):
        """Create the text index on ``search_results`` once per process."""
        if self._index_ready:
            return
        self.db.search_results.create_index(
            [
                ('url', pymongo.TEXT),
                ('title', pymongo.TEXT),
                ('description', pymongo.TEXT),
                ('score', 1),
                ('popularity', 1),
            ],
            name='search_results',
            default_language="english",
        )
        # The client and database are shared class attributes, so the flag is too.
        type(self)._index_ready = True

    def crawl(self, url, depth, *disallowed_links):
        """Fetch and store ``url``, then recurse into its links.

        Args:
            url: Absolute URL to fetch.
            depth: Remaining link levels to follow; values ``<= 0`` store the
                page without following its links.
            *disallowed_links: Optional. When given, the first positional
                value is the collection of URL prefixes that must not be
                crawled (this is how ``start_crawl`` passes its rules).

        Pages that cannot be fetched or that have no ``<title>`` are skipped.
        This method does not close the MongoDB client.
        """
        blocked = disallowed_links[0] if disallowed_links else ()
        if isinstance(blocked, str):
            blocked = (blocked,)

        if self._seen_depths is None:
            self._seen_depths = {}
        # Skip pages already crawled with at least this much remaining depth;
        # a deeper budget still re-crawls so no links are lost.
        if self._seen_depths.get(url, -1) >= depth:
            return
        self._seen_depths[url] = depth

        print(f"Crawling url {url} at depth: {depth}")
        try:
            response = requests.get(url, timeout=self.request_timeout)
        except requests.RequestException:
            print(f"Failed to perform HTTP GET request on {url}")
            return

        soup = BeautifulSoup(response.text, 'lxml')

        title_tag = soup.find('title')
        if title_tag is None:
            print("Failed to retrieve title and description...")
            return
        title = title_tag.text
        description = ''.join(
            paragraph.text.strip().replace('\n', '')
            for paragraph in soup.find_all('p')
        )

        popularity_score = Popularity(url).popularity_score()

        self._ensure_index()
        # Upsert keyed on the URL so re-crawling a page refreshes its content
        # instead of inserting a duplicate; an existing score is left alone.
        self.db.search_results.update_one(
            {'url': url},
            {
                '$set': {
                    'title': title,
                    'description': description,
                    'popularity': popularity_score,
                },
                '$setOnInsert': {'score': 0},
            },
            upsert=True,
        )

        if depth <= 0:
            return

        for anchor in soup.find_all('a', href=True):
            # Resolve relative links and drop "#fragment" parts, which point
            # at the same document.
            target = urllib.parse.urldefrag(
                urllib.parse.urljoin(url, anchor['href'])
            )[0]
            if not target.startswith(('http://', 'https://')):
                continue  # mailto:, javascript:, tel:, ...
            if any(target.startswith(rule) for rule in blocked):
                continue
            self.crawl(target, depth - 1, blocked)


if __name__ == '__main__':
    # Usage: python crawler.py <start-url> <depth>
    # Guarded so that importing this module never starts a crawl.
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} <start-url> <depth>")

    try:
        max_depth = int(sys.argv[2])
    except ValueError:
        sys.exit(f"depth must be an integer, got {sys.argv[2]!r}")

    crawler = Crawler()

    crawler.start_crawl(sys.argv[1], max_depth)
18 changes: 18 additions & 0 deletions Rahul/task4/Crawler/popularlinks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
class Popularity():
    """Score a URL by whether it belongs to a hand-picked popular site."""

    popular_domains = ['https://en.wikipedia.org/', 'https://www.python.org/', 'https://www.rottentomatoes.com/',
                       'https://pypi.org/', 'https://www.indiatoday.in/', 'https://www.geeksforgeeks.org/',
                       'https://stackoverflow.com/']

    # Score from the most recent popularity_score() call (0 before any call).
    ps = 0

    def __init__(self, url):
        """Remember the ``url`` whose popularity will be scored."""
        self.url = url

    def popularity_score(self):
        """Return the popularity score of ``self.url``.

        Each popular domain contained in the URL contributes
        ``100 / len(popular_domains)``. A URL that is exactly a popular
        domain's root earns that share twice. The score is recomputed from
        zero on every call, so repeated calls return the same value, and the
        result is also stored in ``self.ps``.
        """
        share = 100 / len(self.popular_domains)
        score = 0
        for domain in self.popular_domains:
            if domain == self.url:
                score += share
            if domain in self.url:
                score += share

        self.ps = score
        return score
37 changes: 37 additions & 0 deletions Rahul/task4/Templates/base.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<!doctype html>
<html lang="en">

<head>
    <!-- Required meta tags -->
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">

    <!-- Bootstrap CSS (the version must match the integrity hash below) -->
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.1/dist/css/bootstrap.min.css" rel="stylesheet"
        integrity="sha384-F3w7mX95PdgyTmZZMECAngseQB83DfGTowi0iMjiWaeVhAn4FJkqJByhZMI3AhiU" crossorigin="anonymous">

    <style>
        .form-control:focus {
            border-color: #43971b;
            box-shadow: 0px 1px 1px rgba(0, 0, 0, 0.075) inset, 0px 0px 8px rgba(255, 100, 255, 0.5);
        }

        .dropdown {
            float: left;
        }
    </style>

    <title>Glugle</title>
</head>

<body style="background-color: rgb(224, 226, 226);">
    <!-- Child templates inject their page body here -->
    {% block content %}{% endblock %}


    <!-- Option 1: Bootstrap Bundle with Popper (same version as the CSS above) -->
    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.1/dist/js/bootstrap.bundle.min.js"
        integrity="sha384-/bQdsTh/da6pkI1MST/rWKFNjaCP5gBSY4sEBT38Q/9RBh9AH40zEOg7Hlq2THRZ"
        crossorigin="anonymous"></script>
</body>

</html>
22 changes: 22 additions & 0 deletions Rahul/task4/Templates/home.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{% extends 'base.html' %}
{# Landing page: logo, heading and the search form that submits to /search_results. #}

{% block content %}
<div class="col">


    <div class="col mt-5">
        <img src="{{url_for('static' , filename='./images/logo.png')}}" class="mx-auto d-block"
            style="width: 25%; height: 25%;" alt="Glugle logo">
        <h1 style="color:rgb(91, 94, 92); text-align:center;font-family: 'Times New Roman', Times, serif;margin-top: 15px;padding: 15px;"
            class="mt-2"><b>Glugle Search</b></h1>
    </div>


    <form class="mt-5 container" name="search" style="width: 50%" action="/search_results">
        <div class="col-6 mx-auto input-group">
            <input type="text" class="form-control" name="search" placeholder="search..." aria-label="Search">
            <button type="submit" class="btn btn-success">Search</button>
        </div>
    </form>
</div>
{% endblock %}
40 changes: 40 additions & 0 deletions Rahul/task4/Templates/search_result.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{% extends 'base.html' %}
{# Results page: search form, one entry per result in `search_result`, then the pagination links. #}

{% block content %}

<div class="container">
    <form class="mt-5 container" name="search" style="width: 50%" action="/search_results">
        <div class="col-6 mx-auto input-group">
            <input type="text" class="form-control" name="search" placeholder="search..." aria-label="Search">
            <button type="submit" class="btn btn-success">Search</button>
        </div>
    </form>

    <div class="container mx-5 mt-5">
        <p style="color: teal;"><b>Search result for '{{search_string}}'</b></p>
    </div>

    {% if search_result %}
    {% for link in search_result %}
    <div class="container mt-3" style="width:90%">

        <div class="col">
            {# rel="noopener noreferrer" keeps the crawled page from reaching window.opener #}
            <strong><a href="{{ link.url }}" target="_blank" rel="noopener noreferrer">{{ link.title }}</a></strong>
        </div>

        <div class="col">
            <a href="{{ link.url }}" target="_blank" rel="noopener noreferrer">{{ link.url }}</a>
            {# Show the ellipsis only when the description was actually cut #}
            <p>{{ link.description[:300] }}{% if link.description|length > 300 %}...{% endif %}</p>
        </div>
    </div>
    {% endfor %}
    {% else %}
    <b>No results found</b>
    {% endif %}

    <div class="col mb-5 mt-2 text-center">
        {{pagination.links}}
    </div>
</div>

{% endblock %}
Binary file not shown.
Binary file added Rahul/task4/__pycache__/ranking.cpython-39.pyc
Binary file not shown.
75 changes: 75 additions & 0 deletions Rahul/task4/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from flask import Flask, render_template, request
from flask_paginate import Pagination, get_page_args
import pymongo
from ranking import Ranking
import os
from query_processing import QueryProcessing
import time


app = Flask(__name__)


@app.route('/')
def entry_point():
    """Serve the landing page that holds the search form."""
    landing_template = 'home.html'
    return render_template(landing_template)


@app.route('/search_results')
def search_results():
    """Run a full-text search and render one page of ranked results.

    Reads the ``search`` query parameter, looks it up through the MongoDB
    text index on ``search_results``, drops entries that repeat an earlier
    title or URL, ranks what is left and renders the slice selected by the
    ``page`` / ``per_page`` query parameters.
    """
    connect_url = 'mongodb://127.0.0.1:27017/'

    # A request without ``search`` is treated as an empty query, not None.
    search_string = request.args.get('search', '')

    processor = QueryProcessing(search_string)
    keywords = processor.processor()

    client = pymongo.MongoClient(connect_url, connect=False)
    try:
        # NOTE(review): the crawler in this change set writes to the
        # "gluglefinal" database while this view reads "glugle" - confirm
        # which one is intended.
        db = client.glugle

        matches = []
        # The query always searches the whole search string, so it is run
        # once instead of once per keyword; the keywords only decide whether
        # there is anything to search for.
        if keywords:
            matches = list(db.search_results.find(
                {'$text': {'$search': search_string, '$caseSensitive': False}}))

        # Keep the first document for every title and every URL.
        search_result = []
        seen_titles = set()
        seen_urls = set()
        for doc in matches:
            if doc['title'] in seen_titles or doc['url'] in seen_urls:
                continue
            seen_titles.add(doc['title'])
            seen_urls.add(doc['url'])
            search_result.append(doc)

        rank = Ranking(search_result, search_string)

        ranked_result = rank.sorted_results()
    finally:
        client.close()

    # Render the ranked order; it used to be computed and then discarded.
    # NOTE(review): assumes sorted_results() returns the ordered documents.
    # If it returns None (e.g. sorts in place), fall back to the de-duplicated
    # list.
    if ranked_result is None:
        results = search_result
    else:
        results = list(ranked_result)

    page, per_page, offset = get_page_args(page_parameter='page',
                                           per_page_parameter='per_page')

    total = len(results)

    pagination = Pagination(page=page, per_page=per_page, total=total,
                            css_framework='bootstrap4')

    return render_template('search_result.html',
                           search_result=results[offset:offset+per_page],
                           page=page,
                           per_page=per_page,
                           pagination=pagination,
                           search_string=search_string
                           )


if __name__ == '__main__':
    # Debug mode enables the Werkzeug interactive debugger, which lets anyone
    # who can reach the server execute arbitrary code. Keep it off unless the
    # developer opts in, e.g. ``FLASK_DEBUG=1 python app.py``.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
    app.run(debug=debug_enabled)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pip
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Copyright 2010 Pallets

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Loading