# demo.R
#
# Demo script for the R Diffbot API client. This first part prints a banner,
# loads diffbot.R, and defines the arguments used for the API call.

cat("Diffbot Demo for R\n")

# STEP 1: Source the function
# The path is relative, so the script must be run from the directory that
# contains diffbot.R.
source("diffbot.R")

# STEP 2: Arguments for the function
url <- "http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web"
# The token is read from the DIFFBOT_TOKEN environment variable so that a real
# credential never has to be written into this file; when the variable is not
# set, the original placeholder string is used unchanged.
token <- Sys.getenv("DIFFBOT_TOKEN", unset = "DIFFBOT_TOKEN")
api <- "analyze"
fields <- c("author", "title", "type", "tags", "images(*)")
version <- 2


# Possible Values for fields:
# url                 URL submitted. Returned by default.
# resolved_url        Returned if the resolving URL is different from the submitted URL (e.g., link shortening services). Returned by default.
# icon                Page favicon. Returned by default.
# meta                Returns the full contents of page meta tags, including sub-arrays for OpenGraph tags, Twitter Card metadata, schema.org microdata, and -- if available -- oEmbed metadata. Returned with fields.
# querystring         Returns the key/value pairs of the URL querystring, if present. Items without a value will be returned as "true." Returned with fields.
# links               Returns all links (anchor tag href values) found on the page. Returned with fields.
# type                Type of page -- always article. Returned by default.
# title               Title of extracted article. Returned by default.
# text                Plain-text of the extracted article. Returned by default.
# html                HTML of the extracted article. Returned by default.
# numPages            Number of pages automatically concatenated to form the text or html response (By default, Diffbot will automatically concatenate multiple-page articles.)
# date                Article date, normalized in most cases to GMT. Returned by default.
# author              Article author. Returned by default.
# tags                Array of tags, automatically generated by Diffbot natural-language-processing. Returned with fields.
# humanLanguage       Returns the (spoken/human) language of the submitted URL, using two-letter ISO 639-1 nomenclature. Returned with fields.
# images              Array of images, if present within the article body. Returned by default. Each image has the following sub-fields:
#   url               Direct (fully resolved) link to image.
#   pixelHeight       Image height, in pixels.
#   pixelWidth        Image width, in pixels.
#   caption           Diffbot-determined best caption for the image, if detected.
#   primary           Returns "true" if image is identified as primary based on visual analysis of the page.
# videos              Array of videos, if present within the article body. Returned by default. Each video has the following sub-fields:
#   url               Direct (fully resolved) link to the video content.
#   pixelHeight       Video height, in pixels, if accessible.
#   pixelWidth        Video width, in pixels, if accessible.
#   primary           Returns "true" if the video is identified as primary based on visual analysis of the page.


# Call the API with the arguments defined above and keep the response in
# r_return, which the rest of the script reads with `$author`, `$title`,
# `$url` and `$images`.
# Every message ends with "\n" so that each one is printed on its own line
# instead of all output running together.
cat("Reading Response from API Please wait...\n")
r_return <- diffbot(url, token, api, fields, version)
cat("Done...\n")


# Display Author:
# sep = "" keeps cat() from inserting a space before the next line if the
# field holds more than one value.
cat(paste0("Author Name: ", r_return$author, "\n"), sep = "")

# Display Title:
cat(paste0("Title: ", r_return$title, "\n"), sep = "")


# Display URL:
cat(paste0("URL: ", r_return$url, "\n"), sep = "")


# Show Images Retrieved
# Total Images Found
n <- length(r_return$images)

cat(paste0("Please Wait! Downloading images from webpage to the working directory: ", getwd(), "\n"))
# seq_len(n) is empty when no images were returned, so the loop is skipped;
# 1:n would iterate over c(1, 0) and attempt to download a non-existent image.
for (i in seq_len(n)) {
  # Convert the entry to a list so that `$url` can be used on it
  # (presumably entries may arrive as named vectors -- confirm against diffbot.R).
  r_return$images[[i]] <- as.list(r_return$images[[i]])
  image_url <- r_return$images[[i]]$url
  # Strip any query string or fragment before taking the last path component,
  # so the local file name is just the image name.
  file_name <- basename(sub("[?#].*$", "", image_url))
  # Report a failed download as a warning and carry on with the remaining
  # images rather than aborting the whole loop.
  tryCatch(
    download.file(image_url, destfile = file_name, mode = "wb"),
    error = function(e) {
      warning(
        "Failed to download ", image_url, ": ", conditionMessage(e),
        call. = FALSE
      )
    }
  )
}