scrape-keyword.py
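"""Scrape punchng.com search results for a keyword and export the articles to Excel.

For each results page, the script collects every matching article's title, brief
summary, publication date, and the first paragraph of its full content, then
writes the rows to 'Kidnap.xlsx' via pandas.
"""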
# Import necessary libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Initialize an empty list to hold the scraped data
data = []

# Define the keyword to be used for the search
keyword = 'kidnap'
# Define a function to scrape news articles from a given webpage for a specific keyword
def find_news(keyword, page_no):
    # Define the base URL of the website to be scraped
    base_url = 'https://punchng.com/page/'
    # Define the search query string for the keyword
    key = '/?s=' + keyword
    # Combine the base URL, page number, and search query to form the complete URL to be scraped
    url = base_url + str(page_no) + key
    # Use the requests library to fetch the HTML content of the page
    html_text = requests.get(url).text
    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(html_text, 'html.parser')
    # Find all the articles on the page
    articles = soup.find_all('article')
    # Loop through each article and extract the relevant data
    for article in articles:
        # Find the article's title heading(s); find_all returns a list,
        # which is simply empty when no title is present
        headings = article.find_all('h1', class_='post-title')
        for heading in headings:
            # Extract the title text
            title = heading.text.strip()
            # Find the brief summary of the article, guarding against a missing excerpt
            excerpt = article.find('p', class_='post-excerpt')
            content = excerpt.text.strip() if excerpt else ''
            # Find the link to the full article (the title heading wraps an anchor)
            if heading.a is None:
                continue
            link = heading.a['href']
            # Use requests to fetch the full article page
            article_response = requests.get(link)
            # Use BeautifulSoup to parse the full article page
            article_soup = BeautifulSoup(article_response.text, 'html.parser')
            # Find the full content of the article
            news_sections = article_soup.find_all('div', class_='post-content')
            # Find the date the article was published, guarding against a missing tag
            date_tag = article_soup.find('span', class_='post-date')
            pub_date = date_tag.text.strip() if date_tag else ''
            # Loop through the content section(s) and extract the first paragraph of text
            for news_article in news_sections:
                news = news_article.find('p')
                if news is not None:
                    # Append the extracted information to the data list
                    data.append({'Date_published': pub_date, 'Title': title,
                                 'Brief_summary': content,
                                 'Full_Content': news.text.strip()})
# Loop through the first two results pages and call find_news for each one
for x in range(1, 3):
    find_news(keyword, x)
# Convert the data list into a pandas DataFrame
df = pd.DataFrame(data)
# Export the DataFrame to an Excel file
df.to_excel('Kidnap.xlsx')
# Print a success message to the console
print('Exported Successfully')
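# Note: the requests.get calls above use no timeout, no custom headers, and no
# delay between pages. A more robust fetch (a sketch only; the header value and
# delay length are arbitrary choices, not part of the original script) might be:
#
#   import time
#   response = requests.get(url, timeout=10,
#                           headers={'User-Agent': 'Mozilla/5.0'})
#   time.sleep(1)  # pause between requests to avoid hammering the server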