# Resource_scrape.py
import time
import json
import operator
from operator import itemgetter
import re
import urllib
import urllib.parse
from urllib import request
import numpy as np
import pandas as pd
import lxml
from lxml import etree
from fuzzywuzzy import fuzz
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from utils import load_from_mongodb
from utils import store_to_mongodb
from Prototype1_final import ResourceRecommendation
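# Seconds to pause between scroll steps while the playlist page loads more entries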
SCROLL_PAUSE_TIME = 0.5
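# Load the knowledge points (KPs) and their names from the existing recommendation prototype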
obj = ResourceRecommendation()
list_of_KPs = obj.load_KPs()
KP_List = list_of_KPs
list_of_names = obj.load_KP_names()
KP_names = list_of_names
KP_metadata = pd.DataFrame(columns = ['KP_name'], index = KP_List, data = KP_names)
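# Launch Firefox through a local geckodriver (machine-specific path) and open the YouTube playlist to scrape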
driver = webdriver.Firefox(executable_path= r'C:\Users\nguyennamminhquan\work\geckodriver.exe')
#driver.get(r"https://www.youtube.com/playlist?list=PLCd8j6ZYo0lY8ZFrhrAyzCzuo5x9YIrAm")
driver.get(r"https://www.youtube.com/playlist?list=PLCd8j6ZYo0lbwm8pL2Dvr7xs23FovsKXI")
last_height = driver.execute_script("return document.body.scrollHeight")
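# Keep scrolling to the bottom until the page height stops changing, so every playlist entry is rendered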
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
def get_youtube_title(youtube_id):
    '''Fetch the watch page for a video ID and return its <title> text via lxml.'''
    youtube_watch_url = r'https://www.youtube.com/watch?v=' + youtube_id
    youtube = etree.HTML(request.urlopen(youtube_watch_url).read().decode(r'utf-8'))
    video_title = youtube.xpath(r"//title")[0].text
    return video_title
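# Collect the href (watch link) of every video-title element loaded on the playlist page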
user_data = driver.find_elements_by_xpath(r'//*[@id="video-title"]')
links = []
for i in user_data:
    links.append(i.get_attribute('href'))
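# One row per video: page title, watch link, and the list of matching KP indices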
resource_kp = pd.DataFrame(columns = ['title', 'link', 'KP_available'])
def getIndexes(dfObj, value):
    '''Get the first index position of value in dataframe dfObj.'''
    listOfPos = list()
    # Get bool dataframe with True at positions where the given value exists
    result = dfObj.isin([value])
    # Get list of columns that contain the value
    seriesObj = result.any()
    columnNames = list(seriesObj[seriesObj == True].index)
    # Iterate over those columns and fetch the row indexes where the value exists
    for col in columnNames:
        rows = list(result[col][result[col] == True].index)
        for row in rows:
            listOfPos.append(row)
    # Return only the first matching index (the KP key for this name)
    return listOfPos[0]
# Classify a resource title: fuzzy-match it against every KP name and return the KPs it covers
def classify(x):
    KP_contained = list()
    str2 = x
    for i in range(len(KP_names)):
        str1 = KP_names[i]
        ratio = fuzz.token_sort_ratio(str1.lower(), str2.lower())
        if ratio >= 50:
            KP_contained.append(getIndexes(KP_metadata, str1))
    return KP_contained
wait = WebDriverWait(driver, 10)
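# Visit each collected link, fetch the video title, fuzzy-match it to KPs, and append a row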
for x in links:
    driver.get(x)
    # The video ID follows the fixed prefix 'https://www.youtube.com/watch?v=' (32 characters);
    # drop any extra query parameters (e.g. &list=, &index=) that playlist hrefs may carry
    v_id = x[32:].split('&')[0]
    #v_title = driver.find_element_by_xpath('//*[@id="video-title"]').text
    v_title = get_youtube_title(v_id)
    # Trim the fixed leading/trailing characters from the page title to isolate the video name
    final_title = v_title[25:-9]
    KP_available = classify(final_title)
    v_link = r'https://www.youtube.com/watch?v=' + v_id
    resource_kp.loc[len(resource_kp)] = [v_title, v_link, KP_available]
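# Save the scraped resource-to-KP mapping as a CSV file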
resource_kp.to_csv(r'C:\Users\nguyennamminhquan\Desktop\Assignments\Work stuffs\lop_6_hinh.csv')