-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaitm_A3_youtube_transcript_scrape.py
134 lines (107 loc) · 4.53 KB
/
aitm_A3_youtube_transcript_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import os
import googleapiclient.discovery
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi
import matplotlib.pyplot as plt
# Set up YouTube API client.
# The Data API key is read from the YOUTUBE_API_KEY environment variable so
# the credential is never stored in the source file. Any key that was
# previously committed here should be treated as leaked and revoked.
api_service_name = "youtube"
api_version = "v3"
api_key = os.environ.get("YOUTUBE_API_KEY")
if not api_key:
    # Fail early with an actionable message instead of an opaque HTTP error
    # on the first request.
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a valid "
        "YouTube Data API key before running this script."
    )
youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey=api_key)
# Function to search for videos
def search_videos(query, start_date, end_date, max_results=100):
    """Search YouTube for videos matching *query* within a publication window.

    The YouTube Data API documents ``maxResults`` for ``search.list`` as
    limited to 0-50 per request, so results are collected page by page
    (following ``nextPageToken``) until *max_results* items have been
    gathered or the API reports no further pages.

    Args:
        query: Search terms passed as the ``q`` parameter.
        start_date: RFC 3339 timestamp passed as ``publishedAfter``.
        end_date: RFC 3339 timestamp passed as ``publishedBefore``.
        max_results: Upper bound on the total number of items returned.

    Returns:
        A list of search result items, each restricted by the ``fields``
        filter to ``id.videoId`` and the snippet's ``publishedAt``,
        ``channelId``, ``title`` and ``channelTitle``. The list is empty
        when nothing matches or *max_results* is not positive.
    """
    page_size_limit = 50  # documented per-request ceiling for search.list
    items = []
    page_token = None
    while len(items) < max_results:
        params = {
            "part": "id,snippet",
            "type": "video",
            "q": query,
            "videoDefinition": "high",
            "publishedAfter": start_date,
            "publishedBefore": end_date,
            "maxResults": min(page_size_limit, max_results - len(items)),
            # Prioritize search results more relevant to viewers in Australia
            "regionCode": "AU",
            # nextPageToken must be requested explicitly because the fields
            # filter strips everything that is not listed.
            "fields": "nextPageToken,items(id(videoId),snippet(publishedAt,channelId,title,channelTitle))",
        }
        if page_token:
            params["pageToken"] = page_token
        response = youtube.search().list(**params).execute()
        # A filtered response may omit 'items' entirely when there are no hits.
        page_items = response.get("items", [])
        items.extend(page_items)
        page_token = response.get("nextPageToken")
        # Stop when the API has no more pages, or returns an empty page
        # (guards against looping on a token that yields nothing).
        if not page_token or not page_items:
            break
    return items
# Function to get video transcripts using youtube_transcript_api
def get_transcript(video_id):
    """Return the English transcript of *video_id* as a single string.

    The available transcripts are listed first; if none of them has the
    language code 'en', a message is printed and None is returned.
    Otherwise the English transcript is fetched and the 'text' of every
    entry is joined with single spaces. Any exception raised along the way
    is reported on stdout and also results in None.
    """
    try:
        available = YouTubeTranscriptApi.list_transcripts(video_id)
        has_english = any(item.language_code == 'en' for item in available)
        if not has_english:
            print(f"No English transcript found for video {video_id}")
            return None
        english = available.find_transcript(['en'])
        return ' '.join(entry['text'] for entry in english.fetch())
    except Exception as err:
        print(f"Error retrieving transcript for video {video_id}: {err}")
        return None
# Search year by year and record the title, publish date and transcript
# (None when unavailable) of every video found.
query = "medicinal cannabis australia"
start_year = 2013
end_year = 2023
video_data = []
transcripts_per_year = {}
for year in range(start_year, end_year + 1):
    # One calendar year per query, expressed as UTC timestamps.
    start_date = f"{year}-01-01T00:00:00Z"
    end_date = f"{year}-12-31T23:59:59Z"
    print(f"Searching videos for year {year}...")
    videos = search_videos(query, start_date, end_date)
    transcripts_per_year[year] = 0
    for video in videos:
        snippet = video['snippet']
        video_id = video['id']['videoId']
        title = snippet['title']
        published_at = snippet['publishedAt']
        transcript = get_transcript(video_id)
        if transcript is not None:
            print(f"Successfully retrieved transcript for video '{title}' published on {published_at}")
            transcripts_per_year[year] += 1
        # Every video is kept, with or without a transcript; only the
        # per-year counter reflects successful retrievals.
        video_data.append({
            'video_id': video_id,
            'title': title,
            'published_at': published_at,
            'transcript': transcript,
        })
# Build the results table. The columns are declared explicitly so the frame
# has the expected schema even when no videos were collected; without this,
# an empty result list yields a column-less frame and df['title'] below
# raises KeyError.
df = pd.DataFrame(
    video_data,
    columns=['video_id', 'title', 'published_at', 'transcript'],
)
# Add word count columns for title and transcript
df['title_word_count'] = df['title'].apply(lambda x: len(x.split()))
# Videos without a transcript hold None in this column and count as 0 words.
df['transcript_word_count'] = df['transcript'].apply(lambda x: len(x.split()) if pd.notnull(x) else 0)
# Convert the published_at column to datetime objects
df['published_at'] = pd.to_datetime(df['published_at'])
# Create a new column with a standardized date format (e.g., YYYY-MM-DD)
df['date_std'] = df['published_at'].dt.strftime('%Y-%m-%d')
# Save DataFrame to a CSV file
df.to_csv("medicinal_cannabis_videos_by_year.csv", index=False)
# Bar chart of how many transcripts were retrieved for each searched year
years = list(transcripts_per_year)
transcript_counts = [transcripts_per_year[key] for key in years]
plt.bar(years, transcript_counts)
plt.title('Video Transcripts Successfully Retrieved by Year')
plt.xlabel('Year')
plt.ylabel('Number of Transcripts')
# One tick per year so every bar is labelled
plt.xticks(years)
plt.show()
# Dump the collected table to stdout
print(df)
import numpy as np
import seaborn as sns
# Set a seaborn style for better aesthetics
sns.set_style("whitegrid")
# Plot the number of video transcripts retrieved per year
years = list(transcripts_per_year.keys())
transcript_counts = list(transcripts_per_year.values())
# Create a color map to make bars visually appealing: one evenly spaced
# viridis color per year
cmap = plt.get_cmap("viridis")
colors = cmap(np.linspace(0, 1, len(years)))
# Plot the bars with custom colors, edge color, and width
plt.bar(years, transcript_counts, color=colors, edgecolor='black', width=0.8)
# Customize x-axis and y-axis labels
plt.xlabel('Year', fontsize=12)
plt.ylabel('Number of Transcripts', fontsize=12)
# Customize the title
plt.title('Video Transcripts Successfully Retrieved by Year', fontsize=14, fontweight='bold')
# Customize the x-axis ticks and rotate them if necessary
plt.xticks(years, rotation=45)
# Remove the top and right spines for better aesthetics
sns.despine()
# Keep the 0-50 range as the minimum, but grow the upper limit when a year
# has more than 50 transcripts so that tall bars are not silently clipped.
y_upper = max([50, *transcript_counts])
plt.ylim(0, y_upper)
# Show the plot
plt.show()