-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdemo.R
executable file
·71 lines (55 loc) · 3.59 KB
/
demo.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# This is the Demo file for the R diffbot API
# Print the demo banner; the trailing newline keeps later output on its own line.
cat("Diffbot Demo for R\n")
# STEP 1: Source the function
# diffbot.R is resolved relative to the current working directory and is
# expected to define the diffbot() function called later in this script.
source("diffbot.R")
# STEP 2: Arguments for the function
# Page to be analyzed by the API.
url <- "http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web"
# API token: taken from the DIFFBOT_TOKEN environment variable when it is set,
# otherwise the literal placeholder "DIFFBOT_TOKEN" is used (replace it with a
# real token, or export the variable, before running the demo).
token <- Sys.getenv("DIFFBOT_TOKEN", unset = "DIFFBOT_TOKEN")
# Name of the API to call.
api <- "analyze"
# Fields requested in the response; see the list of possible values below.
fields <- c("author", "title", "type", "tags", "images(*)")
# API version passed to diffbot().
version <- 2
# Possible Values for fields:
# url URL submitted. Returned by default.
# resolved_url Returned if the resolving URL is different from the submitted URL (e.g., link shortening services). Returned by default.
# icon Page favicon. Returned by default.
# meta Returns the full contents of page meta tags, including sub-arrays for OpenGraph tags, Twitter Card metadata, schema.org microdata, and -- if available -- oEmbed metadata. Returned with fields.
# querystring Returns the key/value pairs of the URL querystring, if present. Items without a value will be returned as "true." Returned with fields.
# links Returns all links (anchor tag href values) found on the page. Returned with fields.
# type Type of page -- always article. Returned by default.
# title Title of extracted article. Returned by default.
# text Plain-text of the extracted article. Returned by default.
# html HTML of the extracted article. Returned by default.
# numPages Number of pages automatically concatenated to form the text or html response (By default, Diffbot will automatically concatenate multiple-page articles.)
# date Article date, normalized in most cases to GMT. Returned by default.
# author Article author. Returned by default.
# tags Array of tags, automatically generated by Diffbot natural-language-processing. Returned with fields.
# humanLanguage Returns the (spoken/human) language of the submitted URL, using two-letter ISO 639-1 nomenclature. Returned with fields.
# images Array of images, if present within the article body. Returned by default.
#   url Direct (fully resolved) link to image.
#   pixelHeight Image height, in pixels.
#   pixelWidth Image width, in pixels.
#   caption Diffbot-determined best caption for the image, if detected.
#   primary Returns "true" if image is identified as primary based on visual analysis of the page.
# videos Array of videos, if present within the article body. Returned by default.
#   url Direct (fully resolved) link to the video content.
#   pixelHeight Video height, in pixels, if accessible.
#   pixelWidth Video width, in pixels, if accessible.
#   primary Returns "true" if the video is identified as primary based on visual analysis of the page.
# STEP 3: Call the API and show the main fields of the response.
# Every message ends with a newline so each item is printed on its own line
# instead of running together.
cat("Reading Response from API Please wait...\n")
# Arguments are passed positionally, in the order used by the original demo.
r_return <- diffbot(url, token, api, fields, version)
cat("Done...\n")
# Display Author:
cat(paste0("Author Name: ", r_return$author, "\n"), sep = "")
# Display Title:
cat(paste0("Title: ", r_return$title, "\n"), sep = "")
# Display URL:
cat(paste0("URL: ", r_return$url, "\n"), sep = "")
# Show Images Retrieved
# Downloads every image listed in the response into the working directory.
# Total Images Found
n <- length(r_return$images)
if (n == 0) {
  cat("No images found in the API response.\n")
} else {
  cat(paste0(
    "Please Wait! Downloading images from webpage to the working directory: ",
    getwd(), "\n"
  ))
}
# seq_len(n) is empty when n == 0; 1:n would iterate over c(1, 0) and fail.
for (i in seq_len(n)) {
  # Coerce the entry to a list so `$url` works even if it is a named vector.
  r_return$images[[i]] <- as.list(r_return$images[[i]])
  image_url <- r_return$images[[i]]$url
  # Skip entries that do not carry a single usable URL string.
  if (!is.character(image_url) || length(image_url) != 1L ||
      is.na(image_url) || !nzchar(image_url)) {
    next
  }
  # File name = last path component of the URL, without any query string or
  # fragment (characters such as "?" are not valid in file names everywhere).
  file_name <- basename(sub("[?#].*$", "", image_url))
  # A single failed download should not abort the remaining ones.
  tryCatch(
    download.file(image_url, destfile = file_name, mode = "wb"),
    error = function(e) {
      warning(
        "Failed to download ", image_url, ": ", conditionMessage(e),
        call. = FALSE
      )
    }
  )
}