forked from bernorieder/YouTube-Data-Tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmod_channels_related.php
233 lines (151 loc) · 6.52 KB
/
mod_channels_related.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
<?php include("html_head.php"); ?>
<div class="rowTab">
<div class="sectionTab">
<h1>Related Channel Network Module</h1>
</div>
</div>
<div class="rowTab">
<div class="fullTab">
<p>This module starts from a list of seeds and crawls networks of channels connected via the "<a href="https://support.google.com/youtube/answer/7216516?hl=en" target="_blank">related channels</a>" panel which is generated by Google
(the channel network module uses featured channels and subscriptions as connections, which are set by the channel owner).</p>
<p>Related channel data are not retrieved via the API, but scraped from YouTube's web interface. To reduce the number of requests, this module caches channel data for three days. Remember that channels can
<a href="https://support.google.com/youtube/answer/7216516?hl=en" target="_blank">opt out of this feature</a> and will thus not appear in the network.</p>
<p>Crawl depth specifies how far from the seeds the script should go. Crawl depth 0 will get only the relations between seeds. Using many seeds and the maximum crawl depth (3) can take a very long time or the script might run out of memory or get blocked. Start small.</p>
<p>NB: since graph analysis software can have difficulties with very large numbers, channels' viewcount is given in 100s.</p>
</div>
</div>
<div class="rowTab">
<div class="sectionTab"><h1>Parameters</h1></div>
</div>
<?php
// Values echoed back into the form so it keeps the user's last input.
// They come straight from the request, so they are escaped below before being
// written into the HTML; non-string input (e.g. seeds[]=x) falls back to the defaults.
$seedsvalue = (isset($_POST["seeds"]) && is_string($_POST["seeds"])) ? $_POST["seeds"] : "";
$crawldepthvalue = (isset($_POST["crawldepth"]) && is_string($_POST["crawldepth"])) ? $_POST["crawldepth"] : "1";
?>
<form action="mod_channels_related.php" method="post">
<div class="rowTab">
<div class="sectionTab"><h2>1) choose a starting point:</h2></div>
</div>
<div class="rowTab">
<div class="oneTab"></div>
<div class="twoTab">Seeds:</div>
<div class="threeTab">
<textarea name="seeds"><?php echo htmlspecialchars($seedsvalue, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8"); ?></textarea>
</div>
<div class="fourTab">(channel ids, comma separated)</div>
</div>
<div class="rowTab">
<div class="sectionTab"><h2>2) set additional parameters:</h2></div>
</div>
<div class="rowTab">
<div class="oneTab"></div>
<div class="twoTab">Crawl depth:</div>
<div class="threeTab"><input type="text" name="crawldepth" max="3" value="<?php echo htmlspecialchars($crawldepthvalue, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8"); ?>" /></div>
<div class="fourTab">(values are 0, 1, 2 or 3)</div>
</div>
<div class="rowTab">
<div class="oneTab"></div>
<div class="fourTab"><input type="submit" /></div>
</div>
</form>
<?php
// Request handler: runs only when the form was submitted. Validates the
// parameters, prepares the global state that makeNetworkFromIds() and
// renderNetwork() read via `global`, and starts the crawl at depth 0.
if(isset($_POST["seeds"])) {

	$nodes = array();
	$edges = array();

	echo '<div class="rowTab">
<div class="sectionTab"><h1>Result</h1></div>
</div>
<div class="rowTab">Processing:';

	// Crawl depth must be a plain non-negative integer no larger than 3, the
	// maximum stated in the form. A missing/empty value is rejected as well:
	// it would otherwise never compare equal to the integer depth counter.
	$rawdepth = (isset($_POST["crawldepth"]) && is_string($_POST["crawldepth"])) ? trim($_POST["crawldepth"]) : "";

	if(!preg_match("/^\d+$/", $rawdepth) || (int) $rawdepth > 3) {

		// Report the problem but let the page finish rendering (closing div, footer).
		echo "<br /><br />Wrong crawldepth.";

	} else {

		$crawldepth = (int) $rawdepth;

		$seeds = is_string($_POST["seeds"]) ? $_POST["seeds"] : "";
		$seeds = preg_replace("/\s+/","",$seeds);

		// Keep only non-empty, well-formed, unique ids: they are later used to
		// build cache file names and request URLs.
		$ids = array();
		foreach(explode(",",$seeds) as $seed) {
			if(preg_match("/^[A-Za-z0-9_-]+$/", $seed) && !in_array($seed, $ids, true)) {
				$ids[] = $seed;
			}
		}

		$no_seeds = count($ids);

		if($no_seeds == 0) {
			echo "<br /><br />No valid channel ids given.";
		} else {
			makeNetworkFromIds(0);
		}
	}

	echo '</div>';
}
/**
 * Processes one depth level of the related-channel crawl, then either recurses
 * into the next level or hands over to renderNetwork().
 *
 * For every id in the global $ids the channel record is loaded from
 * ./cache/channelinfo_<id>.json when that file is younger than three days;
 * otherwise it is requested through doAPIRequest(), the channel page is
 * scraped for related-channel ids, and the combined record is written to the
 * cache. Results are stored in the global $nodes, discovered links in the
 * global $edges (keys of the form "<from>_|_|X|_|_<to>").
 *
 * NOTE(review): doAPIRequest() is defined outside this block; it is assumed to
 * return the decoded API response as an object with an `items` list - confirm.
 *
 * @param int $depth current crawl depth; 0 means the ids are the seeds
 */
function makeNetworkFromIds($depth) {

	global $apikey,$nodes,$edges,$ids,$crawldepth;

	// Normalise once so the termination test below cannot be defeated by a
	// string/empty crawl depth.
	$maxdepth = (int) $crawldepth;
	$cachettl = 60 * 60 * 24 * 3;		// three days, in seconds

	echo "<br /><br />getting details for ".count($ids)." channels at depth ".$depth.": ";

	$newids = array();
	$seen = array();		// id => true, O(1) duplicate check for $newids

	for($i = 0; $i < count($ids); $i++) {

		$chid = $ids[$i];

		// Ids end up in a file name and in URLs; skip anything that is not a plain id token.
		if(is_string($chid) && preg_match("/^[A-Za-z0-9_-]+$/", $chid)) {

			$jsonfn = "./cache/channelinfo_" . $chid . ".json";
			$node = null;

			// Use the cached record only while it is fresh and decodes to an object.
			if(file_exists($jsonfn) && (time() - filemtime($jsonfn)) < $cachettl) {
				$node = json_decode(file_get_contents($jsonfn));
				if(!is_object($node)) {
					$node = null;
				}
			}

			if($node === null) {

				$restquery = "https://www.googleapis.com/youtube/v3/channels?part=id,snippet,statistics&id=".rawurlencode($chid)."&key=".$apikey;

				$reply = doAPIRequest($restquery);

				if(isset($reply->items[0])) {

					$node = $reply->items[0];

					// A failed page download simply yields a node without related channels.
					$matches = array(1 => array());
					$html = file_get_contents("https://www.youtube.com/channel/" . rawurlencode($chid));
					if($html !== false) {
						preg_match_all('/branded-page-related-channels-item.+data-external-id=\"(.*)\">/', $html, $matches, PREG_OFFSET_CAPTURE);
					}
					$node->matches = $matches[1];

					file_put_contents($jsonfn, json_encode($node));
				}
			}

			if($node !== null) {

				// Seed flags describe the current crawl, so they are set on every
				// load instead of being taken from whatever the cache recorded.
				if($depth == 0) {
					$node->isSeed = "yes";
					$node->seedRank = ($i + 1);
				} else {
					$node->isSeed = "no";
					$node->seedRank = "";
				}

				$nodes[$chid] = $node;
			}
		}

		// Progress output: flush PHP's buffer (if one is active) before the system buffer.
		echo $i . " ";
		if(ob_get_level() > 0) {
			ob_flush();
		}
		flush();
	}

	foreach($nodes as $nodeid => $nodedata) {

		if(empty($nodedata->matches) || !is_array($nodedata->matches)) {
			continue;
		}

		foreach($nodedata->matches as $match) {

			// With PREG_OFFSET_CAPTURE each match is array(text, offset).
			$featid = $match[0];
			$known = isset($nodes[$featid]);

			// Unknown channels are queued for the next depth level (once each).
			if(!$known && !isset($seen[$featid])) {
				$seen[$featid] = true;
				$newids[] = $featid;
			}

			// Record the link when the target is already known, or will be
			// fetched at the next level.
			if($known || $depth < $maxdepth) {
				$edgeid = $nodeid . "_|_|X|_|_" . $featid;
				$edges[$edgeid] = true;
			}
		}
	}

	if($depth >= $maxdepth) {
		renderNetwork();
	} else {
		$ids = $newids;
		makeNetworkFromIds($depth + 1);
	}
}
/**
 * Serialises the crawled network to a GDF file in ./data/ and prints a download link.
 *
 * Reads the globals $nodes (channel id => channel record object), $edges (keys
 * "<from>_|_|X|_|_<to>"), $no_seeds and, when set, $mode. One node line is
 * written per channel; an edge is written only when both of its endpoints are
 * present in $nodes. The node objects themselves are not modified.
 */
function renderNetwork() {

	global $nodes,$edges,$no_seeds,$mode;

	$nodegdf = "nodedef>name VARCHAR,label VARCHAR,isSeed VARCHAR,seedRank INT,subscriberCount INT,videoCount INT,viewCount(100s) INT,country VARCHAR,publishedAt VARCHAR,daysactive INT\n";

	foreach($nodes as $nodeid => $nodedata) {

		$snippet = $nodedata->snippet;
		$stats = $nodedata->statistics;

		// View count is exported in hundreds. Statistics fields can be absent
		// from a record, so each is read defensively.
		$viewcount = round((isset($stats->viewCount) ? $stats->viewCount : 0) / 100);
		$subscribercount = isset($stats->subscriberCount) ? $stats->subscriberCount : "";
		$videocount = isset($stats->videoCount) ? $stats->videoCount : "";
		$country = isset($snippet->country) ? $snippet->country : "not set";
		$daysactive = round((time() - strtotime($snippet->publishedAt)) / (60 * 60 * 24));

		// Commas and quotes would break the comma-separated GDF line, so they are blanked in the label.
		$label = preg_replace("/,|\"|\'/"," ",$snippet->title);

		$nodegdf .= $nodeid . "," . $label . "," . $nodedata->isSeed . "," . $nodedata->seedRank . "," . $subscribercount . "," . $videocount . "," . $viewcount . "," . $country . "," . $snippet->publishedAt . "," . $daysactive . "\n";
	}

	$edgegdf = "edgedef>node1 VARCHAR,node2 VARCHAR,directed BOOLEAN\n";

	foreach($edges as $edgeid => $edgedata) {
		$tmp = explode("_|_|X|_|_",$edgeid);
		// Drop links whose endpoints were never retrieved.
		if(isset($tmp[1]) && isset($nodes[$tmp[0]]) && isset($nodes[$tmp[1]])) {
			$edgegdf .= $tmp[0] . "," . $tmp[1] . ",true\n";
		}
	}

	$gdf = $nodegdf . $edgegdf;

	$filename = "channelnet_" . (isset($mode) ? $mode : "") . $no_seeds . "_nodes" . count($nodes) . "_" . date("Y_m_d-H_i_s");

	// Only offer the download link when the file was actually written.
	if(file_put_contents("./data/".$filename.".gdf", $gdf) === false) {
		echo '<br /><br />The network file could not be written.';
		return;
	}

	echo '<br /><br />The script has created a net with '.count($nodes).' channels from '.$no_seeds.' seeds.<br /><br />

your files:<br />
<a href="./data/'.$filename.'.gdf" download>'.$filename.'.gdf</a><br />';
}
?>
</body>
</html>