From 36ffe6e15debdbe600830b9cf916052b4325bb0b Mon Sep 17 00:00:00 2001 From: Maarten Jacobs Date: Thu, 1 Feb 2024 15:14:10 +0100 Subject: [PATCH] copy from the latest discourse/lib/onebox --- domain_checker.rb | 12 + engine.rb | 213 ++++++++++++ engine/allowlisted_generic_onebox.rb | 310 ++++++++++++++++++ engine/amazon_onebox.rb | 254 ++++++++++++++ engine/animated_image_onebox.rb | 23 ++ engine/asciinema_onebox.rb | 27 ++ engine/audio_com_onebox.rb | 33 ++ engine/audio_onebox.rb | 30 ++ engine/audioboom_onebox.rb | 29 ++ engine/band_camp_onebox.rb | 35 ++ engine/cloud_app_onebox.rb | 53 +++ engine/coub_onebox.rb | 22 ++ engine/discourse_topic_onebox.rb | 59 ++++ engine/facebook_media_onebox.rb | 33 ++ engine/five_hundred_px_onebox.rb | 18 + engine/flickr_onebox.rb | 52 +++ engine/flickr_shortened_onebox.rb | 16 + engine/gfycat_onebox.rb | 118 +++++++ engine/github_actions_onebox.rb | 96 ++++++ engine/github_blob_onebox.rb | 40 +++ engine/github_commit_onebox.rb | 56 ++++ engine/github_folder_onebox.rb | 68 ++++ engine/github_gist_onebox.rb | 88 +++++ engine/github_issue_onebox.rb | 69 ++++ engine/github_pull_request_onebox.rb | 105 ++++++ engine/gitlab_blob_onebox.rb | 38 +++ engine/google_calendar_onebox.rb | 32 ++ engine/google_docs_onebox.rb | 44 +++ engine/google_drive_onebox.rb | 30 ++ engine/google_maps_onebox.rb | 201 ++++++++++++ engine/google_photos_onebox.rb | 73 +++++ engine/google_play_app_onebox.rb | 44 +++ engine/hackernews_onebox.rb | 50 +++ engine/html.rb | 26 ++ engine/image_onebox.rb | 30 ++ engine/imgur_onebox.rb | 72 ++++ engine/instagram_onebox.rb | 75 +++++ engine/json.rb | 13 + engine/kaltura_onebox.rb | 36 ++ engine/mixcloud_onebox.rb | 35 ++ engine/motoko_onebox.rb | 28 ++ engine/opengraph_image.rb | 12 + engine/pastebin_onebox.rb | 59 ++++ engine/pdf_onebox.rb | 29 ++ engine/pubmed_onebox.rb | 61 ++++ engine/reddit_media_onebox.rb | 54 +++ engine/replit_onebox.rb | 25 ++ engine/simplecast_onebox.rb | 30 ++ engine/sketch_fab_onebox.rb | 
36 ++ engine/slides_onebox.rb | 33 ++ engine/sound_cloud_onebox.rb | 33 ++ engine/stack_exchange_onebox.rb | 66 ++++ engine/standard_embed.rb | 203 ++++++++++++ engine/steam_store_onebox.rb | 41 +++ engine/threads_status_onebox.rb | 91 +++++ engine/tiktok_onebox.rb | 60 ++++ engine/trello_onebox.rb | 34 ++ engine/twitch_clips_onebox.rb | 20 ++ engine/twitch_stream_onebox.rb | 15 + engine/twitch_video_onebox.rb | 15 + engine/twitter_status_onebox.rb | 234 +++++++++++++ engine/typeform_onebox.rb | 48 +++ engine/video_onebox.rb | 36 ++ engine/vimeo_onebox.rb | 47 +++ engine/wikimedia_onebox.rb | 43 +++ engine/wikipedia_onebox.rb | 110 +++++++ engine/wistia_onebox.rb | 48 +++ engine/xkcd_onebox.rb | 27 ++ engine/youku_onebox.rb | 35 ++ engine/youtube_onebox.rb | 186 +++++++++++ file_type_finder.rb | 64 ++++ helpers.rb | 279 ++++++++++++++++ json_ld.rb | 46 +++ layout.rb | 55 ++++ layout_support.rb | 17 + matcher.rb | 37 +++ mixins/git_blob_onebox.rb | 243 ++++++++++++++ mixins/github_body.rb | 31 ++ mixins/twitch_onebox.rb | 34 ++ movie.rb | 46 +++ normalizer.rb | 51 +++ oembed.rb | 16 + open_graph.rb | 56 ++++ preview.rb | 111 +++++++ sanitize_config.rb | 111 +++++++ status_check.rb | 44 +++ template_support.rb | 13 + templates/_layout.mustache | 2 +- templates/allowlistedgeneric.mustache | 3 +- templates/discourse_category_onebox.mustache | 34 ++ templates/discourse_topic_onebox.mustache | 10 + templates/discourse_user_onebox.mustache | 32 ++ templates/discoursetopic.mustache | 42 +++ templates/githubactions.mustache | 31 ++ templates/githubblob.mustache | 85 ++--- templates/githubcommit.mustache | 2 +- templates/githubfolder.mustache | 2 - templates/githubgist.mustache | 4 +- templates/githubissue.mustache | 8 +- templates/githubpullrequest.mustache | 102 ++++-- templates/gitlabblob.mustache | 2 +- templates/hackernews.mustache | 18 + templates/json_ld_partials/movie.mustache | 6 + .../preview_error_fragment_onebox.mustache | 4 + 
templates/preview_error_onebox.mustache | 10 + templates/threadsstatus.mustache | 30 ++ templates/twitterstatus.mustache | 17 +- templates/wikimedia.mustache | 2 +- view.rb | 21 ++ 109 files changed, 6059 insertions(+), 79 deletions(-) create mode 100644 domain_checker.rb create mode 100644 engine.rb create mode 100644 engine/allowlisted_generic_onebox.rb create mode 100644 engine/amazon_onebox.rb create mode 100644 engine/animated_image_onebox.rb create mode 100644 engine/asciinema_onebox.rb create mode 100644 engine/audio_com_onebox.rb create mode 100644 engine/audio_onebox.rb create mode 100644 engine/audioboom_onebox.rb create mode 100644 engine/band_camp_onebox.rb create mode 100644 engine/cloud_app_onebox.rb create mode 100644 engine/coub_onebox.rb create mode 100644 engine/discourse_topic_onebox.rb create mode 100644 engine/facebook_media_onebox.rb create mode 100644 engine/five_hundred_px_onebox.rb create mode 100644 engine/flickr_onebox.rb create mode 100644 engine/flickr_shortened_onebox.rb create mode 100644 engine/gfycat_onebox.rb create mode 100644 engine/github_actions_onebox.rb create mode 100644 engine/github_blob_onebox.rb create mode 100644 engine/github_commit_onebox.rb create mode 100644 engine/github_folder_onebox.rb create mode 100644 engine/github_gist_onebox.rb create mode 100644 engine/github_issue_onebox.rb create mode 100644 engine/github_pull_request_onebox.rb create mode 100644 engine/gitlab_blob_onebox.rb create mode 100644 engine/google_calendar_onebox.rb create mode 100644 engine/google_docs_onebox.rb create mode 100644 engine/google_drive_onebox.rb create mode 100644 engine/google_maps_onebox.rb create mode 100644 engine/google_photos_onebox.rb create mode 100644 engine/google_play_app_onebox.rb create mode 100644 engine/hackernews_onebox.rb create mode 100644 engine/html.rb create mode 100644 engine/image_onebox.rb create mode 100644 engine/imgur_onebox.rb create mode 100644 engine/instagram_onebox.rb create mode 100644 
engine/json.rb create mode 100644 engine/kaltura_onebox.rb create mode 100644 engine/mixcloud_onebox.rb create mode 100644 engine/motoko_onebox.rb create mode 100644 engine/opengraph_image.rb create mode 100644 engine/pastebin_onebox.rb create mode 100644 engine/pdf_onebox.rb create mode 100644 engine/pubmed_onebox.rb create mode 100644 engine/reddit_media_onebox.rb create mode 100644 engine/replit_onebox.rb create mode 100644 engine/simplecast_onebox.rb create mode 100644 engine/sketch_fab_onebox.rb create mode 100644 engine/slides_onebox.rb create mode 100644 engine/sound_cloud_onebox.rb create mode 100644 engine/stack_exchange_onebox.rb create mode 100644 engine/standard_embed.rb create mode 100644 engine/steam_store_onebox.rb create mode 100644 engine/threads_status_onebox.rb create mode 100644 engine/tiktok_onebox.rb create mode 100644 engine/trello_onebox.rb create mode 100644 engine/twitch_clips_onebox.rb create mode 100644 engine/twitch_stream_onebox.rb create mode 100644 engine/twitch_video_onebox.rb create mode 100644 engine/twitter_status_onebox.rb create mode 100644 engine/typeform_onebox.rb create mode 100644 engine/video_onebox.rb create mode 100644 engine/vimeo_onebox.rb create mode 100644 engine/wikimedia_onebox.rb create mode 100644 engine/wikipedia_onebox.rb create mode 100644 engine/wistia_onebox.rb create mode 100644 engine/xkcd_onebox.rb create mode 100644 engine/youku_onebox.rb create mode 100644 engine/youtube_onebox.rb create mode 100644 file_type_finder.rb create mode 100644 helpers.rb create mode 100644 json_ld.rb create mode 100644 layout.rb create mode 100644 layout_support.rb create mode 100644 matcher.rb create mode 100644 mixins/git_blob_onebox.rb create mode 100644 mixins/github_body.rb create mode 100644 mixins/twitch_onebox.rb create mode 100644 movie.rb create mode 100644 normalizer.rb create mode 100644 oembed.rb create mode 100644 open_graph.rb create mode 100644 preview.rb create mode 100644 sanitize_config.rb create mode 
100644 status_check.rb create mode 100644 template_support.rb create mode 100644 templates/discourse_category_onebox.mustache create mode 100644 templates/discourse_topic_onebox.mustache create mode 100644 templates/discourse_user_onebox.mustache create mode 100644 templates/discoursetopic.mustache create mode 100644 templates/githubactions.mustache create mode 100644 templates/hackernews.mustache create mode 100644 templates/json_ld_partials/movie.mustache create mode 100644 templates/preview_error_fragment_onebox.mustache create mode 100644 templates/preview_error_onebox.mustache create mode 100644 templates/threadsstatus.mustache create mode 100644 view.rb diff --git a/domain_checker.rb b/domain_checker.rb new file mode 100644 index 00000000..9d78c62b --- /dev/null +++ b/domain_checker.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +module Onebox + class DomainChecker + def self.is_blocked?(hostname) + SiteSetting + .blocked_onebox_domains + &.split("|") + &.any? { |blocked| hostname == blocked || hostname.end_with?(".#{blocked}") } + end + end +end diff --git a/engine.rb b/engine.rb new file mode 100644 index 00000000..b6dfb3bc --- /dev/null +++ b/engine.rb @@ -0,0 +1,213 @@ +# frozen_string_literal: true + +module Onebox + module Engine + def self.included(object) + object.extend(ClassMethods) + end + + def self.engines + constants.select { |constant| constant.to_s =~ /Onebox\z/ }.sort.map(&method(:const_get)) + end + + def self.all_iframe_origins + engines.flat_map { |e| e.iframe_origins }.uniq.compact + end + + def self.origins_to_regexes(origins) + return [/.*/] if origins.include?("*") + + origins.map do |origin| + escaped_origin = Regexp.escape(origin) + if origin.start_with?("*.", "https://*.", "http://*.") + escaped_origin = escaped_origin.sub("\\*", '\S*') + end + + Regexp.new("\\A#{escaped_origin}", "i") + end + end + + attr_reader :url, :uri, :options, :timeout + attr :errors + + def options=(opt) + return @options if opt.nil? 
# make sure options provided + opt = opt.to_h if opt.instance_of?(OpenStruct) + @options.merge!(opt) + end + + def initialize(url, timeout = nil) + @errors = {} + @options = {} + class_name = self.class.name.split("::").last.to_s + + # Set the engine options extracted from global options. + self.options = Onebox.options[class_name] || {} + + @url = url + @uri = URI(url) + if always_https? + @uri.scheme = "https" + @url = @uri.to_s + end + @timeout = timeout || Onebox.options.timeout + end + + # raises error if not defined in onebox engine. + # This is the output method for an engine. + def to_html + fail NoMethodError, "Engines need to implement this method" + end + + # Some oneboxes create iframes or other complicated controls. If you're using + # a live editor with HTML preview, rendering those complicated controls can + # be slow or cause flickering. + # + # This method allows engines to produce a placeholder such as static image + # frame of a video. + # + # By default it just calls `to_html` unless implemented. + def placeholder_html + to_html + end + + private + + # raises error if not defined in onebox engine + # in each onebox, uses either Nokogiri or StandardEmbed to get raw HTML from url + def raw + fail NoMethodError, "Engines need to implement this method" + end + + # raises error if not defined in onebox engine + # in each onebox, returns hash of desired onebox content + def data + fail NoMethodError, "Engines need this method defined" + end + + def link + ::Onebox::Helpers.uri_encode(@url) + end + + def always_https? + self.class.always_https? 
+ end + + module ClassMethods + def handles_content_type?(other) + if other && class_variable_defined?(:@@matcher_content_type) + !!(other.to_s =~ class_variable_get(:@@matcher_content_type)) + end + end + + def ===(other) + if other.kind_of?(URI) + !!(other.to_s =~ class_variable_get(:@@matcher)) + else + super + end + end + + def priority + 100 + end + + def matches_regexp(r) + class_variable_set :@@matcher, r + end + + def matches_content_type(ct) + class_variable_set :@@matcher_content_type, ct + end + + def requires_iframe_origins(*origins) + class_variable_set :@@iframe_origins, origins + end + + def iframe_origins + class_variable_defined?(:@@iframe_origins) ? class_variable_get(:@@iframe_origins) : [] + end + + # calculates a name for onebox using the class name of engine + def onebox_name + name.split("::").last.downcase.gsub(/onebox/, "") + end + + def always_https + @https = true + end + + def always_https? + defined?(@https) ? @https : false + end + end + end +end + +require_relative "helpers" +require_relative "layout_support" +require_relative "file_type_finder" +require_relative "engine/standard_embed" +require_relative "engine/html" +require_relative "engine/json" +require_relative "engine/amazon_onebox" +require_relative "engine/github_issue_onebox" +require_relative "engine/github_blob_onebox" +require_relative "engine/github_commit_onebox" +require_relative "engine/github_folder_onebox" +require_relative "engine/github_gist_onebox" +require_relative "engine/github_pull_request_onebox" +require_relative "engine/google_calendar_onebox" +require_relative "engine/google_docs_onebox" +require_relative "engine/google_maps_onebox" +require_relative "engine/google_play_app_onebox" +require_relative "engine/image_onebox" +require_relative "engine/video_onebox" +require_relative "engine/audio_onebox" +require_relative "engine/threads_status_onebox" +require_relative "engine/stack_exchange_onebox" +require_relative "engine/twitter_status_onebox" 
+require_relative "engine/wikimedia_onebox" +require_relative "engine/wikipedia_onebox" +require_relative "engine/youtube_onebox" +require_relative "engine/youku_onebox" +require_relative "engine/allowlisted_generic_onebox" +require_relative "engine/pubmed_onebox" +require_relative "engine/sound_cloud_onebox" +require_relative "engine/imgur_onebox" +require_relative "engine/pastebin_onebox" +require_relative "engine/slides_onebox" +require_relative "engine/xkcd_onebox" +require_relative "engine/animated_image_onebox" +require_relative "engine/gfycat_onebox" +require_relative "engine/typeform_onebox" +require_relative "engine/vimeo_onebox" +require_relative "engine/steam_store_onebox" +require_relative "engine/sketch_fab_onebox" +require_relative "engine/audioboom_onebox" +require_relative "engine/replit_onebox" +require_relative "engine/asciinema_onebox" +require_relative "engine/mixcloud_onebox" +require_relative "engine/band_camp_onebox" +require_relative "engine/coub_onebox" +require_relative "engine/flickr_onebox" +require_relative "engine/flickr_shortened_onebox" +require_relative "engine/five_hundred_px_onebox" +require_relative "engine/pdf_onebox" +require_relative "engine/twitch_clips_onebox" +require_relative "engine/twitch_stream_onebox" +require_relative "engine/twitch_video_onebox" +require_relative "engine/trello_onebox" +require_relative "engine/cloud_app_onebox" +require_relative "engine/wistia_onebox" +require_relative "engine/simplecast_onebox" +require_relative "engine/instagram_onebox" +require_relative "engine/gitlab_blob_onebox" +require_relative "engine/google_photos_onebox" +require_relative "engine/kaltura_onebox" +require_relative "engine/reddit_media_onebox" +require_relative "engine/google_drive_onebox" +require_relative "engine/facebook_media_onebox" +require_relative "engine/hackernews_onebox" +require_relative "engine/motoko_onebox" +require_relative "engine/tiktok_onebox" diff --git a/engine/allowlisted_generic_onebox.rb 
b/engine/allowlisted_generic_onebox.rb new file mode 100644 index 00000000..23510259 --- /dev/null +++ b/engine/allowlisted_generic_onebox.rb @@ -0,0 +1,310 @@ +# frozen_string_literal: true + +require "htmlentities" +require "ipaddr" + +module Onebox + module Engine + class AllowlistedGenericOnebox + include Engine + include StandardEmbed + include LayoutSupport + + def self.priority + 200 + end + + # Often using the `html` attribute is not what we want, like for some blogs that + # include the entire page HTML. However for some providers like Flickr it allows us + # to return gifv and galleries. + def self.default_html_providers + %w[Flickr Meetup] + end + + def self.html_providers + @html_providers ||= default_html_providers.dup + end + + def self.html_providers=(new_provs) + @html_providers = new_provs + end + + # A re-written URL converts http:// -> https:// + def self.rewrites + @rewrites ||= https_hosts.dup + end + + def self.rewrites=(new_list) + @rewrites = new_list + end + + def self.https_hosts + %w[slideshare.net dailymotion.com livestream.com imgur.com flickr.com] + end + + def self.article_html_hosts + %w[imdb.com] + end + + def self.host_matches(uri, list) + !!list.find { |h| /(^|\.)#{Regexp.escape(h)}$/.match(uri.host) } + end + + def self.allowed_twitter_labels + ["brand", "price", "usd", "cad", "reading time", "likes"] + end + + def self.===(other) + if other.is_a?(URI) + ( + begin + IPAddr.new(other.hostname) + rescue StandardError + nil + end + ).nil? + else + true + end + end + + def to_html + rewrite_https(generic_html) + end + + def placeholder_html + return article_html if (is_article? || force_article_html?) + return image_html if is_image? + if !SiteSetting.enable_diffhtml_preview? && (is_video? || is_card?) + return Onebox::Helpers.video_placeholder_html + end + if !SiteSetting.enable_diffhtml_preview? && is_embedded? 
+ return Onebox::Helpers.generic_placeholder_html + end + to_html + end + + def verified_data + data + end + + def data + @data ||= + begin + html_entities = HTMLEntities.new + d = { link: link }.merge(raw) + + if d[:title].present? + d[:title] = html_entities.decode(Onebox::Helpers.truncate(d[:title], 80)) + end + + d[:description] ||= d[:summary] + if d[:description].present? + d[:description] = html_entities.decode(Onebox::Helpers.truncate(d[:description], 250)) + end + + if d[:site_name].present? + d[:domain] = html_entities.decode(Onebox::Helpers.truncate(d[:site_name], 80)) + elsif d[:domain].present? + d[:domain] = "http://#{d[:domain]}" unless d[:domain] =~ %r{^https?://} + d[:domain] = begin + URI(d[:domain]).host.to_s.sub(/^www\./, "") + rescue StandardError + nil + end + end + + # prefer secure URLs + d[:image] = d[:image_secure_url] || d[:image_url] || d[:thumbnail_url] || d[:image] + d[:image] = Onebox::Helpers.get_absolute_image_url(d[:image], @url) + d[:image] = Onebox::Helpers.normalize_url_for_output(html_entities.decode(d[:image])) + d[:image] = nil if d[:image].blank? + + d[:video] = d[:video_secure_url] || d[:video_url] || d[:video] + d[:video] = nil if d[:video].blank? + + d[:published_time] = d[:article_published_time] if d[:article_published_time].present? + + if d[:published_time].present? + d[:article_published_time] = Time.parse(d[:published_time]).strftime("%-d %b %y") + d[:article_published_time_title] = Time.parse(d[:published_time]).strftime( + "%I:%M%p - %d %B %Y", + ) + end + + # Twitter labels + if d[:label1].present? && d[:data1].present? && + !!AllowlistedGenericOnebox.allowed_twitter_labels.find { |l| + d[:label1] =~ /#{l}/i + } + d[:label_1] = Onebox::Helpers.truncate(d[:label1]) + d[:data_1] = Onebox::Helpers.truncate(d[:data1]) + end + if d[:label2].present? && d[:data2].present? && + !!AllowlistedGenericOnebox.allowed_twitter_labels.find { |l| + d[:label2] =~ /#{l}/i + } + if d[:label_1].blank? 
+ d[:label_1] = Onebox::Helpers.truncate(d[:label2]) + d[:data_1] = Onebox::Helpers.truncate(d[:data2]) + else + d[:label_2] = Onebox::Helpers.truncate(d[:label2]) + d[:data_2] = Onebox::Helpers.truncate(d[:data2]) + end + end + + if d[:label_1].blank? && d[:price_amount].present? && d[:price_currency].present? + d[:label_1] = "Price" + d[:data_1] = Onebox::Helpers.truncate( + "#{d[:price_currency].strip} #{d[:price_amount].strip}", + ) + end + + skip_missing_tags = [:video] + d.each do |k, v| + next if skip_missing_tags.include?(k) + if v == nil || v == "" + errors[k] ||= [] + errors[k] << "is blank" + end + end + + d + end + end + + private + + def rewrite_https(html) + return unless html + if AllowlistedGenericOnebox.host_matches(uri, AllowlistedGenericOnebox.rewrites) + html = html.gsub("http://", "https://") + end + html + end + + def generic_html + return article_html if (is_article? || force_article_html?) + return video_html if is_video? + return image_html if is_image? + return embedded_html if is_embedded? + return card_html if is_card? + + article_html if (has_text? || is_image_article?) + end + + def is_card? + data[:card] == "player" && data[:player] =~ URI.regexp && + options[:allowed_iframe_regexes]&.any? { |r| data[:player] =~ r } + end + + def is_article? + (data[:type] =~ /article/ || data[:asset_type] =~ /article/) && has_text? + end + + def has_text? + has_title? && data[:description].present? + end + + def has_title? + data[:title].present? + end + + def is_image_article? + has_title? && has_image? + end + + def is_image? + data[:type] =~ /photo|image/ && data[:type] !~ /photostream/ && has_image? + end + + def has_image? + data[:image].present? + end + + def is_video? + data[:type] =~ %r{^video[/\.]} && data[:video_type] == "video/mp4" && data[:video].present? # Many sites include 'videos' with text/html types (i.e. iframes) + end + + def is_embedded? 
+ return false unless data[:html] && data[:height] + return true if AllowlistedGenericOnebox.html_providers.include?(data[:provider_name]) + return false unless data[:html]["iframe"] + + fragment = Nokogiri::HTML5.fragment(data[:html]) + src = fragment.at_css("iframe")&.[]("src") + options[:allowed_iframe_regexes]&.any? { |r| src =~ r } + end + + def force_article_html? + AllowlistedGenericOnebox.host_matches(uri, AllowlistedGenericOnebox.article_html_hosts) && + (has_text? || is_image_article?) + end + + def card_html + escaped_url = ::Onebox::Helpers.normalize_url_for_output(data[:player]) + + <<~HTML + + HTML + end + + def article_html + if data[:image] + data[:thumbnail_width] ||= data[:image_width] || data[:width] + data[:thumbnail_height] ||= data[:image_height] || data[:height] + end + + layout.to_html + end + + def image_html + return if data[:image].blank? + + escaped_src = ::Onebox::Helpers.normalize_url_for_output(data[:image]) + + alt = data[:description] || data[:title] + width = data[:image_width] || data[:thumbnail_width] || data[:width] + height = data[:image_height] || data[:thumbnail_height] || data[:height] + + "#{alt}" + end + + def video_html + escaped_video_src = ::Onebox::Helpers.normalize_url_for_output(data[:video]) + escaped_image_src = ::Onebox::Helpers.normalize_url_for_output(data[:image]) + + <<-HTML + + HTML + end + + def embedded_html + fragment = Nokogiri::HTML5.fragment(data[:html]) + fragment.css("img").each { |img| img["class"] = "thumbnail" } + if iframe = fragment.at_css("iframe") + iframe.remove_attribute("style") + iframe["width"] = data[:width] || "100%" + iframe["height"] = data[:height] + iframe["scrolling"] = "no" + iframe["frameborder"] = "0" + end + fragment.to_html + end + end + end +end diff --git a/engine/amazon_onebox.rb b/engine/amazon_onebox.rb new file mode 100644 index 00000000..c7b7d236 --- /dev/null +++ b/engine/amazon_onebox.rb @@ -0,0 +1,254 @@ +# frozen_string_literal: true + +require "json" +require 
"onebox/open_graph" + +module Onebox + module Engine + class AmazonOnebox + include Engine + include LayoutSupport + include HTML + + always_https + matches_regexp( + %r{^https?://(?:www\.)?(?:smile\.)?(amazon|amzn)\.(?com|ca|de|it|es|fr|co\.jp|co\.uk|cn|in|com\.br|com\.mx|nl|pl|sa|sg|se|com\.tr|ae)/}, + ) + + def url + @raw ||= nil + + # If possible, fetch the cached HTML body immediately so we can + # try to grab the canonical URL from that document, + # rather than guess at the best URL structure to use + if !@raw && has_cached_body + @raw = Onebox::Helpers.fetch_html_doc(@url, http_params, body_cacher) + end + + if @raw + canonical_link = @raw.at('//link[@rel="canonical"]/@href') + return canonical_link.to_s if canonical_link + end + + if match && match[:id] + id = + Addressable::URI.encode_component(match[:id], Addressable::URI::CharacterClasses::PATH) + return "https://www.amazon.#{tld}/dp/#{id}" + end + + @url + end + + def tld + @tld ||= @@matcher.match(@url)["tld"] + end + + def http_params + { "User-Agent" => @options[:user_agent] } if @options && @options[:user_agent] + end + + def to_html(ignore_errors = false) + unless ignore_errors + verified_data # forces a check for missing fields + return "" unless errors.empty? + end + + super() + end + + def placeholder_html + to_html(true) + end + + def verified_data + @verified_data ||= + begin + result = data + + required_tags = %i[title description] + required_tags.each do |tag| + if result[tag].blank? + errors[tag] ||= [] + errors[tag] << "is blank" + end + end + + result + end + + @verified_data + end + + private + + def has_cached_body + body_cacher.respond_to?("cache_response_body?") && + body_cacher.cache_response_body?(uri.to_s) && + body_cacher.cached_response_body_exists?(uri.to_s) + end + + def match + @match ||= @url.match(%r{(?:d|g)p/(?:product/|video/detail/)?(?[A-Z0-9]+)(?:/|\?|$)}mi) + end + + def image + if (main_image = raw.css("#main-image")) && main_image.any? 
+ attributes = main_image.first.attributes + + if attributes["data-a-hires"] + return attributes["data-a-hires"].to_s + elsif attributes["data-a-dynamic-image"] + return ::JSON.parse(attributes["data-a-dynamic-image"].value).keys.first + end + end + + if (landing_image = raw.css("#landingImage")) && landing_image.any? + attributes = landing_image.first.attributes + + if attributes["data-old-hires"] + return attributes["data-old-hires"].to_s + else + return landing_image.first["src"].to_s + end + end + + if (ebook_image = raw.css("#ebooksImgBlkFront")) && ebook_image.any? + ::JSON.parse(ebook_image.first.attributes["data-a-dynamic-image"].value).keys.first + end + end + + def price + # get item price (Amazon markup is inconsistent, deal with it) + if raw.css("#priceblock_ourprice .restOfPrice")[0] && + raw.css("#priceblock_ourprice .restOfPrice")[0].inner_text + "#{raw.css("#priceblock_ourprice .restOfPrice")[0].inner_text}#{raw.css("#priceblock_ourprice .buyingPrice")[0].inner_text}.#{raw.css("#priceblock_ourprice .restOfPrice")[1].inner_text}" + elsif raw.css("#priceblock_dealprice") && + (dealprice = raw.css("#priceblock_dealprice span")[0]) + dealprice.inner_text + elsif !raw.css("#priceblock_ourprice").inner_text.empty? + raw.css("#priceblock_ourprice").inner_text + else + result = raw.css("#corePrice_feature_div .a-price .a-offscreen").first&.inner_text + if result.blank? 
+ result = raw.css(".mediaMatrixListItem.a-active .a-color-price").inner_text + end + + result + end + end + + def multiple_authors(authors_xpath) + raw.xpath(authors_xpath).map { |a| a.inner_text.strip }.join(", ") + end + + def data + og = ::Onebox::OpenGraph.new(raw) + + if raw.at_css("#dp.book_mobile") # printed books + title = raw.at("h1#title")&.inner_text + authors = + ( + if raw.at_css("#byline_secondary_view_div") + multiple_authors( + "//div[@id='byline_secondary_view_div']//span[@class='a-text-bold']", + ) + else + raw.at("#byline")&.inner_text + end + ) + rating = + raw.at("#averageCustomerReviews_feature_div .a-icon")&.inner_text || + raw.at("#cmrsArcLink .a-icon")&.inner_text + + table_xpath = + "//div[@id='productDetails_secondary_view_div']//table[@id='productDetails_techSpec_section_1']" + isbn = raw.xpath("#{table_xpath}//tr[8]//td").inner_text.strip + + # if ISBN is misplaced or absent it's hard to find out which data is + # available and where to find it so just set it all to nil + if /^\d(\-?\d){12}$/.match(isbn) + publisher = raw.xpath("#{table_xpath}//tr[1]//td").inner_text.strip + published = raw.xpath("#{table_xpath}//tr[2]//td").inner_text.strip + book_length = raw.xpath("#{table_xpath}//tr[6]//td").inner_text.strip + else + isbn = publisher = published = book_length = nil + end + + result = { + link: url, + title: title, + by_info: authors, + image: og.image || image, + description: raw.at("#productDescription")&.inner_text, + rating: "#{rating}#{", " if rating && (!isbn&.empty? 
|| !price&.empty?)}", + price: price, + isbn_asin_text: "ISBN", + isbn_asin: isbn, + publisher: publisher, + published: "#{published}#{", " if published && !price&.empty?}", + } + elsif raw.at_css("#dp.ebooks_mobile") # ebooks + title = raw.at("#ebooksTitle")&.inner_text + authors = + ( + if raw.at_css("#a-popover-mobile-udp-contributor-popover-id") + multiple_authors( + "//div[@id='a-popover-mobile-udp-contributor-popover-id']//span[contains(@class,'a-text-bold')]", + ) + else + (raw.at("#byline")&.inner_text&.strip || raw.at("#bylineInfo")&.inner_text&.strip) + end + ) + rating = + raw.at("#averageCustomerReviews_feature_div .a-icon")&.inner_text || + raw.at("#cmrsArcLink .a-icon")&.inner_text || + raw.at("#acrCustomerReviewLink .a-icon")&.inner_text + + table_xpath = "//div[@id='detailBullets_secondary_view_div']//ul" + asin = raw.xpath("#{table_xpath}//li[4]/span/span[2]").inner_text + + # if ASIN is misplaced or absent it's hard to find out which data is + # available and where to find it so just set it all to nil + if /^[0-9A-Z]{10}$/.match(asin) + publisher = raw.xpath("#{table_xpath}//li[2]/span/span[2]").inner_text + published = raw.xpath("#{table_xpath}//li[1]/span/span[2]").inner_text + else + asin = publisher = published = nil + end + + result = { + link: url, + title: title, + by_info: authors, + image: og.image || image, + description: raw.at("#productDescription")&.inner_text, + rating: "#{rating}#{", " if rating && (!asin&.empty? 
|| !price&.empty?)}", + price: price, + isbn_asin_text: "ASIN", + isbn_asin: asin, + publisher: publisher, + published: "#{published}#{", " if published && !price&.empty?}", + } + else + title = og.title || CGI.unescapeHTML(raw.css("title").inner_text) + result = { link: url, title: title, image: og.image || image, price: price } + + result[:by_info] = raw.at("#by-line") + result[:by_info] = Onebox::Helpers.clean(result[:by_info].inner_html) if result[:by_info] + + summary = raw.at("#productDescription") + + description = og.description || summary&.inner_text&.strip + description = raw.css("meta[name=description]").first&.[]("content") if description.blank? + result[:description] = CGI.unescapeHTML( + Onebox::Helpers.truncate(description, 250), + ) if description + end + + result[:price] = nil if result[:price].start_with?("$0") || result[:price] == 0 + + result + end + end + end +end diff --git a/engine/animated_image_onebox.rb b/engine/animated_image_onebox.rb new file mode 100644 index 00000000..a960ac5e --- /dev/null +++ b/engine/animated_image_onebox.rb @@ -0,0 +1,23 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class AnimatedImageOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://.*(giphy\.com|gph\.is|tenor\.com)/}) + always_https + + def to_html + og = get_opengraph + if og.image + "" + else + escaped_url = ::Onebox::Helpers.normalize_url_for_output(@url) + "" + end + end + end + end +end diff --git a/engine/asciinema_onebox.rb b/engine/asciinema_onebox.rb new file mode 100644 index 00000000..f6c27108 --- /dev/null +++ b/engine/asciinema_onebox.rb @@ -0,0 +1,27 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class AsciinemaOnebox + include Engine + include StandardEmbed + + always_https + matches_regexp(/^https?:\/\/asciinema\.org\/a\/[\p{Alnum}_\-]+$/) + + def to_html + "" + end + + def placeholder_html + "" + end + + private + + def match + @match ||= 
@url.match(/asciinema\.org\/a\/(?[\p{Alnum}_\-]+)$/) + end + end + end +end diff --git a/engine/audio_com_onebox.rb b/engine/audio_com_onebox.rb new file mode 100644 index 00000000..faa3faf0 --- /dev/null +++ b/engine/audio_com_onebox.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class AudioComOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://audio\.com}) + requires_iframe_origins "https://audio.com" + always_https + + def to_html + oembed = get_oembed + oembed.html.gsub("visual=true", "visual=false") + end + + def placeholder_html + oembed = get_oembed + return if oembed.thumbnail_url.blank? + "" + end + + protected + + def get_oembed_url + oembed_url = "https://api.audio.com/oembed?url=#{url}" + oembed_url += "&maxheight=228" unless url["/collections/"] + oembed_url + end + end + end +end diff --git a/engine/audio_onebox.rb b/engine/audio_onebox.rb new file mode 100644 index 00000000..8a41b100 --- /dev/null +++ b/engine/audio_onebox.rb @@ -0,0 +1,30 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class AudioOnebox + include Engine + + matches_regexp(%r{^(https?:)?//.*\.(mp3|ogg|opus|wav|m4a)(\?.*)?$}i) + + def always_https? + AllowlistedGenericOnebox.host_matches(uri, AllowlistedGenericOnebox.https_hosts) + end + + def to_html + escaped_url = ::Onebox::Helpers.normalize_url_for_output(@url) + + <<-HTML + + HTML + end + + def placeholder_html + SiteSetting.enable_diffhtml_preview ? 
to_html : ::Onebox::Helpers.audio_placeholder_html + end + end + end +end diff --git a/engine/audioboom_onebox.rb b/engine/audioboom_onebox.rb new file mode 100644 index 00000000..daf26f6a --- /dev/null +++ b/engine/audioboom_onebox.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class AudioboomOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://audioboom\.com/posts/\d+}) + always_https + + def placeholder_html + oembed = get_oembed + + <<-HTML + + HTML + end + + def to_html + get_oembed.html + end + end + end +end diff --git a/engine/band_camp_onebox.rb b/engine/band_camp_onebox.rb new file mode 100644 index 00000000..937826ce --- /dev/null +++ b/engine/band_camp_onebox.rb @@ -0,0 +1,35 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class BandCampOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://.*\.bandcamp\.com/(album|track)/}) + always_https + requires_iframe_origins "https://bandcamp.com" + + def placeholder_html + og = get_opengraph + "" + end + + def to_html + og = get_opengraph + escaped_src = og.video_secure_url || og.video + + <<-HTML + + HTML + end + end + end +end diff --git a/engine/cloud_app_onebox.rb b/engine/cloud_app_onebox.rb new file mode 100644 index 00000000..2c076381 --- /dev/null +++ b/engine/cloud_app_onebox.rb @@ -0,0 +1,53 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class CloudAppOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://cl\.ly}) + always_https + + def to_html + og = get_opengraph + + if !og.image.nil? 
+ image_html(og) + elsif og.title.to_s[/\.(mp4|ogv|webm)$/] + video_html(og) + else + link_html(og) + end + end + + private + + def link_html(og) + <<-HTML + + #{og.title} + + HTML + end + + def video_html(og) + direct_src = ::Onebox::Helpers.normalize_url_for_output("#{og.get(:url)}/#{og.title}") + + <<-HTML + + HTML + end + + def image_html(og) + <<-HTML + + CloudApp + + HTML + end + end + end +end diff --git a/engine/coub_onebox.rb b/engine/coub_onebox.rb new file mode 100644 index 00000000..5cac6c5d --- /dev/null +++ b/engine/coub_onebox.rb @@ -0,0 +1,22 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class CoubOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://coub\.com/view/}) + always_https + + def placeholder_html + oembed = get_oembed + "" + end + + def to_html + get_oembed.html + end + end + end +end diff --git a/engine/discourse_topic_onebox.rb b/engine/discourse_topic_onebox.rb new file mode 100644 index 00000000..0247299d --- /dev/null +++ b/engine/discourse_topic_onebox.rb @@ -0,0 +1,59 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class DiscourseTopicOnebox + include Engine + include StandardEmbed + include LayoutSupport + + matches_regexp(%r{/t/.*(/\d+)?}) + + def data + @data ||= { + categories: categories, + link: link, + article_published_time: published_time.strftime("%-d %b %y"), + article_published_time_title: published_time.strftime("%I:%M%p - %d %B %Y"), + domain: html_entities.decode(raw[:site_name].truncate(80, separator: " ")), + description: html_entities.decode(raw[:description].truncate(250, separator: " ")), + title: html_entities.decode(raw[:title].truncate(80, separator: " ")), + image: image, + render_tags?: render_tags?, + render_category_block?: render_category_block?, + }.reverse_merge(raw) + end + alias verified_data data + + private + + def categories + Array + .wrap(raw[:article_sections]) + .map + .with_index { |name, index| { name: name, color: 
raw[:article_section_colors][index] } } + end + + def published_time + @published_time ||= Time.parse(raw[:published_time]) + end + + def html_entities + @html_entities ||= HTMLEntities.new + end + + def image + image = Onebox::Helpers.get_absolute_image_url(raw[:image], @url) + Onebox::Helpers.normalize_url_for_output(html_entities.decode(image)) + end + + def render_tags? + raw[:article_tags].present? + end + + def render_category_block? + render_tags? || categories.present? + end + end + end +end diff --git a/engine/facebook_media_onebox.rb b/engine/facebook_media_onebox.rb new file mode 100644 index 00000000..9e676e18 --- /dev/null +++ b/engine/facebook_media_onebox.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class FacebookMediaOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://.*\.facebook\.com/(\w+)/(videos|\?).*}) + always_https + requires_iframe_origins "https://www.facebook.com" + + def to_html + metadata = get_twitter + if metadata.present? && metadata[:card] == "player" && metadata[:player].present? 
+ <<-HTML + + HTML + else + html = Onebox::Engine::AllowlistedGenericOnebox.new(@url, @timeout).to_html + html.presence + end + end + end + end +end diff --git a/engine/five_hundred_px_onebox.rb b/engine/five_hundred_px_onebox.rb new file mode 100644 index 00000000..d2aab48e --- /dev/null +++ b/engine/five_hundred_px_onebox.rb @@ -0,0 +1,18 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class FiveHundredPxOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://500px\.com/photo/\d+/}) + always_https + + def to_html + og = get_opengraph + "" + end + end + end +end diff --git a/engine/flickr_onebox.rb b/engine/flickr_onebox.rb new file mode 100644 index 00000000..435a53e0 --- /dev/null +++ b/engine/flickr_onebox.rb @@ -0,0 +1,52 @@ +# frozen_string_literal: true + +require_relative "./opengraph_image" + +module Onebox + module Engine + class FlickrOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://www\.flickr\.com/photos/}) + always_https + + def to_html + og = get_opengraph + return album_html(og) if og.url =~ %r{/sets/} + return image_html(og) if !og.image.nil? 
+ nil + end + + private + + def album_html(og) + escaped_url = ::Onebox::Helpers.normalize_url_for_output(url) + album_title = "[Album] #{og.title}" + + <<-HTML + + HTML + end + + def image_html(og) + escaped_url = ::Onebox::Helpers.normalize_url_for_output(url) + + <<-HTML + + Imgur + + HTML + end + end + end +end diff --git a/engine/flickr_shortened_onebox.rb b/engine/flickr_shortened_onebox.rb new file mode 100644 index 00000000..0a6baf13 --- /dev/null +++ b/engine/flickr_shortened_onebox.rb @@ -0,0 +1,16 @@ +# frozen_string_literal: true + +require_relative "./opengraph_image" + +module Onebox + module Engine + class FlickrShortenedOnebox + include Engine + include StandardEmbed + include OpengraphImage + + matches_regexp(%r{^https?://flic\.kr/p/}) + always_https + end + end +end diff --git a/engine/gfycat_onebox.rb b/engine/gfycat_onebox.rb new file mode 100644 index 00000000..29166541 --- /dev/null +++ b/engine/gfycat_onebox.rb @@ -0,0 +1,118 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class GfycatOnebox + include Engine + include JSON + + matches_regexp(%r{^https?://gfycat\.com/}) + always_https + + # This engine should have priority over AllowlistedGenericOnebox. + def self.priority + 1 + end + + def to_html + <<-HTML + + HTML + end + + def placeholder_html + <<-HTML + +
+ #{data[:name]} +
+ HTML + end + + private + + def match + @match ||= @url.match(%r{^https?://gfycat\.com/(gifs/detail/)?(?.+)}) + end + + def og_data + return @og_data if defined?(@og_data) + + response = + begin + Onebox::Helpers.fetch_response(url, redirect_limit: 10) + rescue StandardError + nil + end + page = Nokogiri.HTML(response) + script = page.at_css('script[type="application/ld+json"]') + + if json_string = script&.text + @og_data = ::MultiJson.load(json_string, symbolize_keys: true) + else + @og_data = {} + end + end + + def data + return @data if defined?(@data) + + @data = { + name: match[:name], + title: og_data[:headline] || "No Title", + author: og_data[:author], + url: @url, + } + + if keywords = og_data[:keywords]&.split(",") + @data[:keywords] = keywords + .map { |keyword| "##{keyword}" } + .join(" ") + end + + if og_data[:video] + content_url = ::Onebox::Helpers.normalize_url_for_output(og_data[:video][:contentUrl]) + video_url = Pathname.new(content_url) + @data[:webmUrl] = video_url.sub_ext(".webm").to_s + @data[:mp4Url] = video_url.sub_ext(".mp4").to_s + + thumbnail_url = ::Onebox::Helpers.normalize_url_for_output(og_data[:video][:thumbnailUrl]) + @data[:posterUrl] = thumbnail_url + + @data[:width] = og_data[:video][:width] + @data[:height] = og_data[:video][:height] + end + + @data + end + end + end +end diff --git a/engine/github_actions_onebox.rb b/engine/github_actions_onebox.rb new file mode 100644 index 00000000..c6309b3c --- /dev/null +++ b/engine/github_actions_onebox.rb @@ -0,0 +1,96 @@ +# frozen_string_literal: true + +require_relative "../mixins/github_body" + +module Onebox + module Engine + class GithubActionsOnebox + include Engine + include LayoutSupport + include JSON + + matches_regexp( + %r{^https?://(?:www\.)?(?:(?:\w)+\.)?github\.com/(?.+)/(?.+)/(actions/runs/[[:digit:]]+|pull/[[:digit:]]*/checks\?check_run_id=[[:digit:]]+)}, + ) + always_https + + def url + if type == :actions_run + 
"https://api.github.com/repos/#{match[:org]}/#{match[:repo]}/actions/runs/#{match[:run_id]}" + elsif type == :pr_run + "https://api.github.com/repos/#{match[:org]}/#{match[:repo]}/check-runs/#{match[:check_run_id]}" + end + end + + def self.priority + 90 # overlaps with GithubPullRequestOnebox + end + + private + + def match_url + return if defined?(@match) && defined?(@type) + + if match = + @url.match( + %r{^https?://(?:www\.)?(?:(?:\w)+\.)?github\.com/(?.+)/(?.+)/actions/runs/(?[[:digit:]]+)}, + ) + @match = match + @type = :actions_run + end + + if match = + @url.match( + %r{^https?://(?:www\.)?(?:(?:\w)+\.)?github\.com/(?.+)/(?.+)/pull/(?[[:digit:]]*)/checks\?check_run_id=(?[[:digit:]]+)}, + ) + @match = match + @type = :pr_run + end + end + + def match + return @match if defined?(@match) + + match_url + @match + end + + def type + return @type if defined?(@type) + + match_url + @type + end + + def data + status = "unknown" + if raw["status"] == "completed" + if raw["conclusion"] == "success" + status = "success" + elsif raw["conclusion"] == "failure" + status = "failure" + end + elsif raw["status"] == "in_progress" + status = "pending" + end + + title = + if type == :actions_run + raw["head_commit"]["message"].lines.first + elsif type == :pr_run + pr_url = + "https://api.github.com/repos/#{match[:org]}/#{match[:repo]}/pulls/#{match[:pr_id]}" + ::MultiJson.load(URI.parse(pr_url).open(read_timeout: timeout))["title"] + end + + { + :link => @url, + :title => title, + :name => raw["name"], + :run_number => raw["run_number"], + status => true, + } + end + end + end +end diff --git a/engine/github_blob_onebox.rb b/engine/github_blob_onebox.rb new file mode 100644 index 00000000..4b4ed67f --- /dev/null +++ b/engine/github_blob_onebox.rb @@ -0,0 +1,40 @@ +# frozen_string_literal: true + +require_relative "../mixins/git_blob_onebox" + +module Onebox + module Engine + class GithubBlobOnebox + def self.git_regexp + %r{^https?://(www\.)?github\.com.*/blob/} + end + + def 
self.onebox_name + "githubblob" + end + + include Onebox::Mixins::GitBlobOnebox + + def i18n + { + binary_file: I18n.t("onebox.github.binary_file"), + truncated_file: I18n.t("onebox.github.truncated_file"), + show_original: I18n.t("onebox.github.show_original"), + requires_iframe: I18n.t("onebox.github.requires_iframe"), + } + end + + def raw_regexp + %r{github\.com/(?[^/]+)/(?[^/]+)/blob/(?[^/]+)/(?[^#]+)(#(L(?[^-]*)(-L(?.*))?))?}mi + end + + def raw_template(m) + "https://raw.githubusercontent.com/#{m[:user]}/#{m[:repo]}/#{m[:sha1]}/#{m[:file]}" + end + + def title + Sanitize.fragment(Onebox::Helpers.uri_unencode(link).sub(%r{^https?\://github\.com/}, "")) + end + end + end +end diff --git a/engine/github_commit_onebox.rb b/engine/github_commit_onebox.rb new file mode 100644 index 00000000..f7c81a10 --- /dev/null +++ b/engine/github_commit_onebox.rb @@ -0,0 +1,56 @@ +# frozen_string_literal: true + +require_relative "../mixins/github_body" + +module Onebox + module Engine + class GithubCommitOnebox + include Engine + include LayoutSupport + include JSON + include Onebox::Mixins::GithubBody + + matches_regexp(%r{^https?://(?:www\.)?(?:(?:\w)+\.)?(github)\.com(?:/)?(?:.)*/commit/}) + always_https + + def url + "https://api.github.com/repos/#{match[:owner]}/#{match[:repository]}/commits/#{match[:sha]}" + end + + private + + def match + return @match if defined?(@match) + + @match = + @url.match(%{github\.com/(?[^/]+)/(?[^/]+)/commit/(?[^/]+)}) + @match ||= + @url.match( + %{github\.com/(?[^/]+)/(?[^/]+)/pull/(?[^/]+)/commit/(?[^/]+)}, + ) + + @match + end + + def data + result = raw.clone + + lines = result["commit"]["message"].split("\n") + result["title"] = lines.first + result["body"], result["excerpt"] = compute_body(lines[1..lines.length].join("\n")) + + committed_at = Time.parse(result["commit"]["committer"]["date"]) + result["committed_at"] = committed_at.strftime("%I:%M%p - %d %b %y %Z") + result["committed_at_date"] = committed_at.strftime("%F") + 
result["committed_at_time"] = committed_at.strftime("%T") + + result["link"] = link + ulink = URI(link) + result["domain"] = "#{ulink.host}/#{ulink.path.split("/")[1]}/#{ulink.path.split("/")[2]}" + result["i18n"] = { committed: I18n.t("onebox.github.committed") } + + result + end + end + end +end diff --git a/engine/github_folder_onebox.rb b/engine/github_folder_onebox.rb new file mode 100644 index 00000000..d77a27ca --- /dev/null +++ b/engine/github_folder_onebox.rb @@ -0,0 +1,68 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class GithubFolderOnebox + include Engine + include StandardEmbed + include LayoutSupport + + matches_regexp(/^https?:\/\/(?:www\.)?(?:(?:\w)+\.)?(github)\.com[\:\d]*(\/[^\/]+){2}\/tree/) + always_https + + private + + def data + og = get_opengraph + + max_length = 250 + + display_path = extract_path(og.url, max_length) + display_description = clean_description(og.description, og.title, max_length) + + title = og.title + + fragment = Addressable::URI.parse(url).fragment + if fragment + fragment = Addressable::URI.unencode(fragment) + + # For links to markdown and rdoc + if html_doc.css(".Box.md, .Box.rdoc").present? + node = html_doc.css("a.anchor").find { |n| n["href"] == "##{fragment}" } + subtitle = node&.parent&.text + end + + title = "#{title} - #{subtitle}" if subtitle + end + + { + link: url, + title: Onebox::Helpers.truncate(title, 250), + path: display_path, + description: display_description, + favicon: get_favicon, + } + end + + def extract_path(root, max_length) + path = url.split("#")[0].split("?")[0] + path = path["#{root}/tree/".length..-1] + + return unless path + + path.length > max_length ? 
path[-max_length..-1] : path + end + + def clean_description(description, title, max_length) + return unless description + + desc_end = " - #{title}" + if description[-desc_end.length..-1] == desc_end + description = description[0...-desc_end.length] + end + + Onebox::Helpers.truncate(description, max_length) + end + end + end +end diff --git a/engine/github_gist_onebox.rb b/engine/github_gist_onebox.rb new file mode 100644 index 00000000..5ddcb2bb --- /dev/null +++ b/engine/github_gist_onebox.rb @@ -0,0 +1,88 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class GithubGistOnebox + include Engine + include LayoutSupport + include JSON + + MAX_FILES = 3 + + matches_regexp(%r{^http(?:s)?://gist\.(?:(?:\w)+\.)?(github)\.com(?:/)?}) + always_https + + def url + "https://api.github.com/gists/#{match[:sha]}" + end + + private + + def data + @data ||= { + title: "gist.github.com", + link: link, + gist_files: gist_files.take(MAX_FILES), + truncated_files?: truncated_files?, + i18n: i18n, + } + end + + def i18n + { + truncated_file: I18n.t("onebox.github.truncated_file"), + more_than_three_files: I18n.t("onebox.github.more_than_three_files"), + show_original: I18n.t("onebox.github.show_original"), + } + end + + def truncated_files? + gist_files.size > MAX_FILES + end + + def gist_files + return [] unless gist_api + + @gist_files ||= gist_api["files"].values.map { |file_json| GistFile.new(file_json) } + end + + def gist_api + @raw ||= raw.clone + rescue OpenURI::HTTPError + # The Gist API rate limit of 60 requests per hour was reached. + nil + end + + def match + @match ||= @url.match(%r{gist\.github\.com/([^/]+/)?(?[0-9a-f]+)}) + end + + class GistFile + attr_reader :filename + attr_reader :language + + MAX_LINES = 10 + + def initialize(json) + @json = json + @filename = @json["filename"] + @language = @json["language"] + end + + def content + lines.take(MAX_LINES).join("\n") + end + + def truncated? 
+ lines.size > MAX_LINES + end + + private + + def lines + @lines ||= @json["content"].split("\n") + end + end + end + end +end diff --git a/engine/github_issue_onebox.rb b/engine/github_issue_onebox.rb new file mode 100644 index 00000000..145df704 --- /dev/null +++ b/engine/github_issue_onebox.rb @@ -0,0 +1,69 @@ +# frozen_string_literal: true + +require_relative "../mixins/github_body" + +module Onebox + module Engine + class GithubIssueOnebox + #Author Lidlanca 2014 + include Engine + include LayoutSupport + include JSON + include Onebox::Mixins::GithubBody + + matches_regexp( + %r{^https?://(?:www\.)?(?:(?:\w)+\.)?github\.com/(?.+)/(?.+)/issues/([[:digit:]]+)}, + ) + always_https + + def url + m = match + "https://api.github.com/repos/#{m["org"]}/#{m["repo"]}/issues/#{m["item_id"]}" + end + + private + + def match + @match ||= + @url.match( + %r{^http(?:s)?://(?:www\.)?(?:(?:\w)+\.)?github\.com/(?.+)/(?.+)/(?issues)/(?[\d]+)}, + ) + end + + def i18n + { opened: I18n.t("onebox.github.opened"), closed: I18n.t("onebox.github.closed") } + end + + def data + created_at = Time.parse(raw["created_at"]) + closed_at = Time.parse(raw["closed_at"]) if raw["closed_at"] + body, excerpt = compute_body(raw["body"]) + ulink = URI(link) + + labels = + raw["labels"].map do |l| + { name: Emoji.codes_to_img(Onebox::Helpers.sanitize(l["name"])) } + end + + { + link: @url, + title: raw["title"], + body: body, + excerpt: excerpt, + labels: labels, + user: raw["user"], + created_at: created_at.strftime("%I:%M%p - %d %b %y %Z"), + created_at_date: created_at.strftime("%F"), + created_at_time: created_at.strftime("%T"), + closed_at: closed_at&.strftime("%I:%M%p - %d %b %y %Z"), + closed_at_date: closed_at&.strftime("%F"), + closed_at_time: closed_at&.strftime("%T"), + closed_by: raw["closed_by"], + avatar: "https://avatars1.githubusercontent.com/u/#{raw["user"]["id"]}?v=2&s=96", + domain: "#{ulink.host}/#{ulink.path.split("/")[1]}/#{ulink.path.split("/")[2]}", + i18n: i18n, + } + end + 
end + end +end diff --git a/engine/github_pull_request_onebox.rb b/engine/github_pull_request_onebox.rb new file mode 100644 index 00000000..6ad192fc --- /dev/null +++ b/engine/github_pull_request_onebox.rb @@ -0,0 +1,105 @@ +# frozen_string_literal: true + +require_relative "../mixins/github_body" + +module Onebox + module Engine + class GithubPullRequestOnebox + include Engine + include LayoutSupport + include JSON + include Onebox::Mixins::GithubBody + + GITHUB_COMMENT_REGEX = /(\r\n)/ + + matches_regexp(%r{^https?://(?:www\.)?(?:(?:\w)+\.)?(github)\.com(?:/)?(?:.)*/pull}) + always_https + + def url + "https://api.github.com/repos/#{match[:owner]}/#{match[:repository]}/pulls/#{match[:number]}" + end + + private + + def match + @match ||= + @url.match(%r{github\.com/(?[^/]+)/(?[^/]+)/pull/(?[^/]+)}) + end + + def data + result = raw.clone + result["link"] = link + + created_at = Time.parse(result["created_at"]) + result["created_at"] = created_at.strftime("%I:%M%p - %d %b %y %Z") + result["created_at_date"] = created_at.strftime("%F") + result["created_at_time"] = created_at.strftime("%T") + + ulink = URI(link) + result["domain"] = "#{ulink.host}/#{ulink.path.split("/")[1]}/#{ulink.path.split("/")[2]}" + + result["body"], result["excerpt"] = compute_body(result["body"]) + + if result["commit"] = load_commit(link) + result["body"], result["excerpt"] = + compute_body(result["commit"]["commit"]["message"].lines[1..].join) + elsif result["comment"] = load_comment(link) + result["body"], result["excerpt"] = compute_body(result["comment"]["body"]) + elsif result["discussion"] = load_review(link) + result["body"], result["excerpt"] = compute_body(result["discussion"]["body"]) + else + result["pr"] = true + end + result["i18n"] = i18n + result["i18n"]["pr_summary"] = I18n.t( + "onebox.github.pr_summary", + { + commits: result["commits"], + changed_files: result["changed_files"], + additions: result["additions"], + deletions: result["deletions"], + }, + ) + + result + end 
+ + def i18n + { + opened: I18n.t("onebox.github.opened"), + commit_by: I18n.t("onebox.github.commit_by"), + comment_by: I18n.t("onebox.github.comment_by"), + review_by: I18n.t("onebox.github.review_by"), + } + end + + def load_commit(link) + if commit_match = link.match(%r{commits/(\h+)}) + load_json( + "https://api.github.com/repos/#{match[:owner]}/#{match[:repository]}/commits/#{commit_match[1]}", + ) + end + end + + def load_comment(link) + if comment_match = link.match(/#issuecomment-(\d+)/) + load_json( + "https://api.github.com/repos/#{match[:owner]}/#{match[:repository]}/issues/comments/#{comment_match[1]}", + ) + end + end + + def load_review(link) + if review_match = link.match(/#discussion_r(\d+)/) + load_json( + "https://api.github.com/repos/#{match[:owner]}/#{match[:repository]}/pulls/comments/#{review_match[1]}", + ) + end + end + + def load_json(url) + ::MultiJson.load(URI.parse(url).open(read_timeout: timeout)) + end + end + end +end diff --git a/engine/gitlab_blob_onebox.rb b/engine/gitlab_blob_onebox.rb new file mode 100644 index 00000000..a3a3ecca --- /dev/null +++ b/engine/gitlab_blob_onebox.rb @@ -0,0 +1,38 @@ +# frozen_string_literal: true + +require_relative "../mixins/git_blob_onebox" + +module Onebox + module Engine + class GitlabBlobOnebox + def self.git_regexp + %r{^https?://(www\.)?gitlab\.com.*/blob/} + end + + def self.onebox_name + "gitlabblob" + end + + include Onebox::Mixins::GitBlobOnebox + + def i18n + { + truncated_file: I18n.t("onebox.gitlab.truncated_file"), + show_original: I18n.t("onebox.gitlab.show_original"), + } + end + + def raw_regexp + %r{gitlab\.com/(?[^/]+)/(?[^/]+)/blob/(?[^/]+)/(?[^#]+)(#(L(?[^-]*)(-L(?.*))?))?}mi + end + + def raw_template(m) + "https://gitlab.com/#{m[:user]}/#{m[:repo]}/raw/#{m[:sha1]}/#{m[:file]}" + end + + def title + Sanitize.fragment(Onebox::Helpers.uri_unencode(link).sub(%r{^https?\://gitlab\.com/}, "")) + end + end + end +end diff --git a/engine/google_calendar_onebox.rb 
b/engine/google_calendar_onebox.rb new file mode 100644 index 00000000..a7b57b22 --- /dev/null +++ b/engine/google_calendar_onebox.rb @@ -0,0 +1,32 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class GoogleCalendarOnebox + include Engine + + matches_regexp(/^(https?:)?\/\/((www|calendar)\.google\.[\w.]{2,}|goo\.gl)\/calendar\/.+$/) + always_https + requires_iframe_origins "https://calendar.google.com" + + def to_html + url = @url.split("&").first + src = ::Onebox::Helpers.normalize_url_for_output(url) + "" + end + + def placeholder_html + <<-HTML +
+
+
+ +

Google Calendar

+
+
+
+ HTML + end + end + end +end diff --git a/engine/google_docs_onebox.rb b/engine/google_docs_onebox.rb new file mode 100644 index 00000000..03c43673 --- /dev/null +++ b/engine/google_docs_onebox.rb @@ -0,0 +1,44 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class GoogleDocsOnebox + include Engine + include StandardEmbed + include LayoutSupport + + SUPPORTED_ENDPOINTS = %w[spreadsheets document forms presentation] + SHORT_TYPES = { spreadsheets: :sheets, document: :docs, presentation: :slides, forms: :forms } + + matches_regexp( + %r{^(https?:)?//(docs\.google\.com)/(?(#{SUPPORTED_ENDPOINTS.join("|")}))/d/((?[\w-]*)).+$}, + ) + always_https + + private + + def data + og_data = get_opengraph + short_type = SHORT_TYPES[match[:endpoint].to_sym] + + description = + if og_data.description.blank? + "This #{short_type.to_s.chop.capitalize} is private" + else + Onebox::Helpers.truncate(og_data.description, 250) + end + + { + link: link, + title: og_data.title || "Google #{short_type.to_s.capitalize}", + description: description, + type: short_type, + } + end + + def match + @match ||= @url.match(@@matcher) + end + end + end +end diff --git a/engine/google_drive_onebox.rb b/engine/google_drive_onebox.rb new file mode 100644 index 00000000..cb8bbf79 --- /dev/null +++ b/engine/google_drive_onebox.rb @@ -0,0 +1,30 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class GoogleDriveOnebox + include Engine + include StandardEmbed + include LayoutSupport + + matches_regexp(%r{^(https?:)?//(drive\.google\.com)/file/d/(?[\w-]*)/.+$}) + always_https + + protected + + def data + og_data = get_opengraph + title = og_data.title || "Google Drive" + title = "#{og_data.title} (video)" if og_data.type =~ %r{^video[/\.]} + description = og_data.description || "Google Drive file." 
+ + { + link: link, + title: title, + description: Onebox::Helpers.truncate(description, 250), + image: og_data.image, + } + end + end + end +end diff --git a/engine/google_maps_onebox.rb b/engine/google_maps_onebox.rb new file mode 100644 index 00000000..cd3072a9 --- /dev/null +++ b/engine/google_maps_onebox.rb @@ -0,0 +1,201 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class GoogleMapsOnebox + include Engine + + class << self + def ===(other) + if other.kind_of? URI + @@matchers && @@matchers.any? { |m| other.to_s =~ m[:regexp] } + else + super + end + end + + private + + def matches_regexp(key, regexp) + (@@matchers ||= []) << { key: key, regexp: regexp } + end + end + + always_https + requires_iframe_origins("https://maps.google.com", "https://google.com") + + # Matches shortened Google Maps URLs + matches_regexp :short, %r{^(https?:)?//goo\.gl/maps/} + + # Matches URLs for custom-created maps + matches_regexp :custom, + %r"^(?:https?:)?//www\.google(?:\.(?:\w{2,}))+/maps/d/(?:edit|viewer|embed)\?mid=.+$" + + # Matches URLs with streetview data + matches_regexp :streetview, + %r"^(?:https?:)?//www\.google(?:\.(?:\w{2,}))+/maps[^@]+@(?-?[\d.]+),(?-?[\d.]+),(?:\d+)a,(?[\d.]+)y,(?[\d.]+)h,(?[\d.]+)t.+?data=.*?!1s(?[^!]{22})" + + # Matches "normal" Google Maps URLs with arbitrary data + matches_regexp :standard, %r"^(?:https?:)?//www\.google(?:\.(?:\w{2,}))+/maps" + + # Matches URLs for the old Google Maps domain which we occasionally get redirected to + matches_regexp :canonical, %r"^(?:https?:)?//maps\.google(?:\.(?:\w{2,}))+/maps\?" + + def initialize(url, timeout = nil) + super + resolve_url! + rescue Net::HTTPServerException, + Timeout::Error, + Net::HTTPError, + Errno::ECONNREFUSED, + RuntimeError => err + raise ArgumentError, "malformed url or unresolveable: #{err.message}" + end + + def streetview? + !!@streetview + end + + def to_html + "
" + end + + def placeholder_html + ::Onebox::Helpers.map_placeholder_html + end + + private + + def data + { link: url, title: url } + end + + def resolve_url! + @streetview = false + type, match = match_url + + # Resolve shortened URL, if necessary + if type == :short + follow_redirect! + type, match = match_url + end + + # Try to get the old-maps URI, it is far easier to embed. + if type == :standard + retry_count = 10 + while (retry_count -= 1) > 0 + follow_redirect! + type, match = match_url + break if type != :standard + sleep 0.1 + end + end + + case type + when :standard + # Fallback for map URLs that don't resolve into an easily embeddable old-style URI + # Roadmaps use a "z" zoomlevel, satellite maps use "m" the horizontal width in meters + # TODO: tilted satellite maps using "a,y,t" + match = @url.match(/@(?[\d.-]+),(?[\d.-]+),(?\d+)(\.\d+)?(?[mz])/) + raise "unexpected standard url #{@url}" unless match + zoom = match[:mz] == "z" ? match[:zoom] : Math.log2(57280048.0 / match[:zoom].to_f).round + location = "#{match[:lon]},#{match[:lat]}" + url = "https://maps.google.com/maps?ll=#{location}&z=#{zoom}&output=embed&dg=ntvb" + url += "&q=#{$1}" if match = @url.match(%r{/place/([^/\?]+)}) + url += "&cid=#{($1 + $2).to_i(16)}" if @url.match(/!3m1!1s0x(\h{16}):0x(\h{16})/) + @url = url + @placeholder = + "https://maps.googleapis.com/maps/api/staticmap?maptype=roadmap¢er=#{location}&zoom=#{zoom}&size=690x400&sensor=false" + when :custom + url = @url.dup + @url = rewrite_custom_url(url, "embed") + @placeholder = rewrite_custom_url(url, "thumbnail") + @placeholder_height = @placeholder_width = 120 + when :streetview + @streetview = true + panoid = match[:pano] + lon = match[:lon].to_f.to_s + lat = match[:lat].to_f.to_s + heading = match[:heading].to_f.round(4).to_s + pitch = (match[:pitch].to_f / 10.0).round(4).to_s + fov = (match[:zoom].to_f / 100.0).round(4).to_s + zoom = match[:zoom].to_f.round + @url = + 
"https://www.google.com/maps/embed?pb=!3m2!2sen!4v0!6m8!1m7!1s#{panoid}!2m2!1d#{lon}!2d#{lat}!3f#{heading}!4f#{pitch}!5f#{fov}" + @placeholder = + "https://maps.googleapis.com/maps/api/streetview?size=690x400&location=#{lon},#{lat}&pano=#{panoid}&fov=#{zoom}&heading=#{heading}&pitch=#{pitch}&sensor=false" + when :canonical + query = URI.decode_www_form(uri.query).to_h + if !query.has_key?("ll") + unless query.has_key?("sll") + raise ArgumentError, "canonical url lacks location argument" + end + query["ll"] = query["sll"] + @url += "&ll=#{query["sll"]}" + end + location = query["ll"] + if !query.has_key?("z") + unless query.has_key?("spn") || query.has_key?("sspn") + raise ArgumentError, "canonical url has incomplete query arguments" + end + if !query.has_key?("spn") + query["spn"] = query["sspn"] + @url += "&spn=#{query["sspn"]}" + end + angle = query["spn"].split(",").first.to_f + zoom = (Math.log(690.0 * 360.0 / angle / 256.0) / Math.log(2)).round + else + zoom = query["z"] + end + @url = @url.sub("output=classic", "output=embed") + @placeholder = + "https://maps.googleapis.com/maps/api/staticmap?maptype=roadmap&size=690x400&sensor=false¢er=#{location}&zoom=#{zoom}" + else + raise "unexpected url type #{type.inspect}" + end + end + + def match_url + @@matchers.each do |matcher| + if m = matcher[:regexp].match(@url) + return matcher[:key], m + end + end + raise ArgumentError, "\"#{@url}\" does not match any known pattern" + end + + def rewrite_custom_url(url, target) + uri = URI(url) + uri.path = uri.path.sub(%r{(?<=^/maps/d/)\w+$}, target) + uri.to_s + end + + def follow_redirect! + begin + http = + FinalDestination::HTTP.start( + uri.host, + uri.port, + use_ssl: uri.scheme == "https", + open_timeout: timeout, + read_timeout: timeout, + ) + + response = http.head(uri.path) + unless %w[200 301 302].include?(response.code) + raise "unexpected response code #{response.code}" + end + + @url = response.code == "200" ? 
uri.to_s : response["Location"] + @uri = URI(@url) + ensure + begin + http.finish + rescue StandardError + nil + end + end + end + end + end +end diff --git a/engine/google_photos_onebox.rb b/engine/google_photos_onebox.rb new file mode 100644 index 00000000..afb8aade --- /dev/null +++ b/engine/google_photos_onebox.rb @@ -0,0 +1,73 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class GooglePhotosOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://(photos)\.(app\.goo\.gl|google\.com)}) + always_https + + def to_html + og = get_opengraph + return video_html(og) if og.video_secure_url + return album_html(og) if og.type == "google_photos:photo_album" + return image_html(og) if og.image + nil + end + + private + + def video_html(og) + escaped_url = ::Onebox::Helpers.normalize_url_for_output(url) + + <<-HTML + + HTML + end + + def album_html(og) + escaped_url = ::Onebox::Helpers.normalize_url_for_output(url) + album_title = og.description.nil? ? 
og.title : "[#{og.description}] #{og.title}" + + <<-HTML + + HTML + end + + def image_html(og) + escaped_url = ::Onebox::Helpers.normalize_url_for_output(url) + + <<-HTML + + Google Photos + + HTML + end + end + end +end diff --git a/engine/google_play_app_onebox.rb b/engine/google_play_app_onebox.rb new file mode 100644 index 00000000..f03fceee --- /dev/null +++ b/engine/google_play_app_onebox.rb @@ -0,0 +1,44 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class GooglePlayAppOnebox + include Engine + include LayoutSupport + include HTML + + DEFAULTS = { MAX_DESCRIPTION_CHARS: 500 } + + matches_regexp(%r{^https?://play\.(?:(?:\w)+\.)?(google)\.com(?:/)?/store/apps/}) + always_https + + private + + def data + price = + begin + raw.css("meta[itemprop=price]").first["content"] + rescue StandardError + "Free" + end + { + link: link, + title: + raw.css("meta[property='og:title']").first["content"].gsub( + " - Apps on Google Play", + "", + ), + image: + ::Onebox::Helpers.normalize_url_for_output( + raw.css("meta[property='og:image']").first["content"], + ), + description: + raw.css("meta[name=description]").first["content"][ + 0..DEFAULTS[:MAX_DESCRIPTION_CHARS] + ].chop + "...", + price: price == "0" ? 
"Free" : price, + } + end + end + end +end diff --git a/engine/hackernews_onebox.rb b/engine/hackernews_onebox.rb new file mode 100644 index 00000000..b4507f26 --- /dev/null +++ b/engine/hackernews_onebox.rb @@ -0,0 +1,50 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class HackernewsOnebox + include Engine + include LayoutSupport + include JSON + + REGEX = %r{^https?://news\.ycombinator\.com/item\?id=(?\d+)} + + matches_regexp(REGEX) + + # This is their official API: https://blog.ycombinator.com/hacker-news-api/ + def url + "https://hacker-news.firebaseio.com/v0/item/#{match[:item_id]}.json" + end + + private + + def match + @match ||= @url.match(REGEX) + end + + def data + return nil unless %w[story comment].include?(raw["type"]) + + html_entities = HTMLEntities.new + data = { + link: @url, + title: Onebox::Helpers.truncate(raw["title"], 80), + favicon: "https://news.ycombinator.com/y18.gif", + timestamp: Time.at(raw["time"]).strftime("%-l:%M %p - %-d %b %Y"), + author: raw["by"], + } + + data["description"] = html_entities.decode( + Onebox::Helpers.truncate(raw["text"], 400), + ) if raw["text"] + + if raw["type"] == "story" + data["data_1"] = raw["score"] + data["data_2"] = raw["descendants"] + end + + data + end + end + end +end diff --git a/engine/html.rb b/engine/html.rb new file mode 100644 index 00000000..b0dfba21 --- /dev/null +++ b/engine/html.rb @@ -0,0 +1,26 @@ +# frozen_string_literal: true + +module Onebox + module Engine + module HTML + private + + # Overwrite for any custom headers + def http_params + {} + end + + def raw + @raw ||= Onebox::Helpers.fetch_html_doc(url, http_params, body_cacher) + end + + def body_cacher + self.options&.[](:body_cacher) + end + + def html? 
+ raw.respond_to(:css) + end + end + end +end diff --git a/engine/image_onebox.rb b/engine/image_onebox.rb new file mode 100644 index 00000000..28d446ca --- /dev/null +++ b/engine/image_onebox.rb @@ -0,0 +1,30 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class ImageOnebox + include Engine + + matches_content_type(%r{^image/(png|jpg|jpeg|gif|bmp|tif|tiff|webp|avif)$}) + matches_regexp(%r{^(https?:)?//.+\.(png|jpg|jpeg|gif|bmp|tif|tiff|webp|avif)(\?.*)?$}i) + + def always_https? + AllowlistedGenericOnebox.host_matches(uri, AllowlistedGenericOnebox.https_hosts) + end + + def to_html + # Fix Dropbox image links + if @url[%r{^https://www.dropbox.com/s/}] + @url.sub!("https://www.dropbox.com", "https://dl.dropboxusercontent.com") + end + + escaped_url = ::Onebox::Helpers.normalize_url_for_output(@url) + <<-HTML + + + + HTML + end + end + end +end diff --git a/engine/imgur_onebox.rb b/engine/imgur_onebox.rb new file mode 100644 index 00000000..c1e6efb2 --- /dev/null +++ b/engine/imgur_onebox.rb @@ -0,0 +1,72 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class ImgurOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://(www\.)?imgur\.com}) + always_https + + def to_html + og = get_opengraph + return video_html(og) if !og.video_secure_url.nil? + return album_html(og) if is_album? + return image_html(og) if !og.image.nil? + nil + end + + private + + def video_html(og) + <<-HTML + + HTML + end + + def album_html(og) + escaped_url = ::Onebox::Helpers.normalize_url_for_output(url) + album_title = "[Album] #{og.title}" + + <<-HTML + + HTML + end + + def is_album? 
+ response = + begin + Onebox::Helpers.fetch_response("https://api.imgur.com/oembed.json?url=#{url}") + rescue StandardError + "{}" + end + oembed_data = ::MultiJson.load(response, symbolize_keys: true) + imgur_data_id = Nokogiri.HTML(oembed_data[:html]).xpath("//blockquote").attr("data-id") + imgur_data_id.to_s[%r{a/}] + end + + def image_html(og) + escaped_url = ::Onebox::Helpers.normalize_url_for_output(url) + + <<-HTML + + Imgur + + HTML + end + end + end +end diff --git a/engine/instagram_onebox.rb b/engine/instagram_onebox.rb new file mode 100644 index 00000000..86fd5594 --- /dev/null +++ b/engine/instagram_onebox.rb @@ -0,0 +1,75 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class InstagramOnebox + include Engine + include StandardEmbed + include LayoutSupport + + matches_regexp( + %r{^https?://(?:www\.)?(?:instagram\.com|instagr\.am)/?(?:.*)/(?:p|tv)/[a-zA-Z\d_-]+}, + ) + always_https + requires_iframe_origins "https://www.instagram.com" + + def clean_url + url + .scan( + %r{^https?://(?:www\.)?(?:instagram\.com|instagr\.am)/?(?:.*)/(?:p|tv)/[a-zA-Z\d_-]+}, + ) + .flatten + .first + end + + def data + @data ||= + begin + oembed = get_oembed + if oembed.data.empty? + raise "No oEmbed data found. 
Ensure 'facebook_app_access_token' is valid" + end + + { + link: clean_url.gsub("/#{oembed.author_name}/", "/") + "/embed", + title: "@#{oembed.author_name}", + image: oembed.thumbnail_url, + image_width: oembed.data[:thumbnail_width], + image_height: oembed.data[:thumbnail_height], + description: Onebox::Helpers.truncate(oembed.title, 250), + } + end + end + + def placeholder_html + ::Onebox::Helpers.image_placeholder_html + end + + def to_html + <<-HTML + + HTML + end + + protected + + def access_token + (options[:facebook_app_access_token] || Onebox.options.facebook_app_access_token).to_s + end + + def get_oembed_url + if access_token != "" + "https://graph.facebook.com/v9.0/instagram_oembed?url=#{clean_url}&access_token=#{access_token}" + else + # The following is officially deprecated by Instagram, but works in some limited circumstances. + "https://api.instagram.com/oembed/?url=#{clean_url}" + end + end + end + end +end diff --git a/engine/json.rb b/engine/json.rb new file mode 100644 index 00000000..204b09c7 --- /dev/null +++ b/engine/json.rb @@ -0,0 +1,13 @@ +# frozen_string_literal: true + +module Onebox + module Engine + module JSON + private + + def raw + @raw ||= ::MultiJson.load(URI.parse(url).open(read_timeout: timeout)) + end + end + end +end diff --git a/engine/kaltura_onebox.rb b/engine/kaltura_onebox.rb new file mode 100644 index 00000000..f9ace113 --- /dev/null +++ b/engine/kaltura_onebox.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class KalturaOnebox + include Engine + include StandardEmbed + + always_https + matches_regexp(%r{^https?://[a-z0-9]+\.kaltura\.com/id/[a-zA-Z0-9]+}) + requires_iframe_origins "https://*.kaltura.com" + + def preview_html + og = get_opengraph + + <<~HTML + + HTML + end + + def to_html + og = get_opengraph + + <<~HTML + + HTML + end + end + end +end diff --git a/engine/mixcloud_onebox.rb b/engine/mixcloud_onebox.rb new file mode 100644 index 00000000..1a681d5e --- /dev/null +++ 
b/engine/mixcloud_onebox.rb @@ -0,0 +1,35 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class MixcloudOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://www\.mixcloud\.com/}) + always_https + requires_iframe_origins "https://www.mixcloud.com" + + def placeholder_html + oembed = get_oembed + + <<-HTML + + HTML + end + + def to_html + get_oembed.html + end + end + end +end diff --git a/engine/motoko_onebox.rb b/engine/motoko_onebox.rb new file mode 100644 index 00000000..9a0243bf --- /dev/null +++ b/engine/motoko_onebox.rb @@ -0,0 +1,28 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class MotokoOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://embed\.(motoko|smartcontracts)\.org/?.*}) + requires_iframe_origins("https://embed.motoko.org", "https://embed.smartcontracts.org") + always_https + + def to_html + get_oembed.html + end + + def placeholder_html + ::Onebox::Helpers.generic_placeholder_html + end + + protected + + def get_oembed_url + "https://embed.smartcontracts.org/services/onebox?url=#{url}" + end + end + end +end diff --git a/engine/opengraph_image.rb b/engine/opengraph_image.rb new file mode 100644 index 00000000..77bf8ee2 --- /dev/null +++ b/engine/opengraph_image.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +module Onebox + module Engine + module OpengraphImage + def to_html + og = get_opengraph + "" + end + end + end +end diff --git a/engine/pastebin_onebox.rb b/engine/pastebin_onebox.rb new file mode 100644 index 00000000..c542457f --- /dev/null +++ b/engine/pastebin_onebox.rb @@ -0,0 +1,59 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class PastebinOnebox + include Engine + include LayoutSupport + + MAX_LINES = 10 + + matches_regexp(%r{^http?://pastebin\.com}) + + private + + def data + @data ||= { title: "pastebin.com", link: link, content: content, truncated?: truncated? 
} + end + + def content + lines.take(MAX_LINES).join("\n") + end + + def truncated? + lines.size > MAX_LINES + end + + def lines + return @lines if defined?(@lines) + response = + begin + Onebox::Helpers.fetch_response( + "http://pastebin.com/raw/#{paste_key}", + redirect_limit: 1, + ) + rescue StandardError + "" + end + @lines = response.split("\n") + end + + def paste_key + regex = + case uri + when %r{/raw/} + %r{/raw/([^/]+)} + when %r{/download/} + %r{/download/([^/]+)} + when %r{/embed/} + %r{/embed/([^/]+)} + else + %r{/([^/]+)} + end + + match = uri.path.match(regex) + match[1] if match && match[1] + end + end + end +end diff --git a/engine/pdf_onebox.rb b/engine/pdf_onebox.rb new file mode 100644 index 00000000..30af7975 --- /dev/null +++ b/engine/pdf_onebox.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class PdfOnebox + include Engine + include LayoutSupport + + matches_regexp(%r{^(https?:)?//.*\.pdf(\?.*)?$}i) + always_https + + private + + def data + begin + size = Onebox::Helpers.fetch_content_length(@url) + rescue StandardError + raise "Unable to read pdf file: #{@url}" + end + + { + link: link, + title: File.basename(uri.path), + filesize: size ? 
Onebox::Helpers.pretty_filesize(size.to_i) : nil, + } + end + end + end +end diff --git a/engine/pubmed_onebox.rb b/engine/pubmed_onebox.rb new file mode 100644 index 00000000..fe3a39da --- /dev/null +++ b/engine/pubmed_onebox.rb @@ -0,0 +1,61 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class PubmedOnebox + include Engine + include LayoutSupport + + matches_regexp(%r{^https?://(?:(?:\w)+\.)?(www.ncbi.nlm.nih)\.gov(?:/)?/pubmed/\d+}) + + private + + def xml + return @xml if defined?(@xml) + doc = Nokogiri.XML(URI.join(@url, "?report=xml&format=text").open) + pre = doc.xpath("//pre") + @xml = Nokogiri.XML("" + pre.text + "") + end + + def authors + initials = xml.css("Initials").map { |x| x.content } + last_names = xml.css("LastName").map { |x| x.content } + author_list = (initials.zip(last_names)).map { |i, l| i + " " + l } + if author_list.length > 1 + author_list[-2] = author_list[-2] + " and " + author_list[-1] + author_list.pop + end + author_list.join(", ") + end + + def date + xml + .css("PubDate") + .children + .map { |x| x.content } + .select { |s| !s.match(/^\s+$/) } + .map { |s| s.split } + .flatten + .sort + .reverse + .join(" ") # Reverse sort so month before year. 
+ end + + def data + { + title: xml.css("ArticleTitle").text, + authors: authors, + journal: xml.css("Title").text, + abstract: xml.css("AbstractText").text, + date: date, + link: @url, + pmid: match[:pmid], + } + end + + def match + @match ||= @url.match(%r{www\.ncbi\.nlm\.nih\.gov/pubmed/(?[0-9]+)}) + end + end + end +end diff --git a/engine/reddit_media_onebox.rb b/engine/reddit_media_onebox.rb new file mode 100644 index 00000000..5f7450fe --- /dev/null +++ b/engine/reddit_media_onebox.rb @@ -0,0 +1,54 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class RedditMediaOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://(www\.)?reddit\.com}) + + def to_html + if raw[:type] == "image" + <<-HTML + + HTML + elsif raw[:type] =~ %r{^video[/\.]} + <<-HTML + + HTML + else + html = Onebox::Engine::AllowlistedGenericOnebox.new(@url, @timeout).to_html + html.presence + end + end + end + end +end diff --git a/engine/replit_onebox.rb b/engine/replit_onebox.rb new file mode 100644 index 00000000..55c49fea --- /dev/null +++ b/engine/replit_onebox.rb @@ -0,0 +1,25 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class ReplitOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://(replit\.com|repl\.it)/.+}) + always_https + + def placeholder_html + oembed = get_oembed + + <<-HTML + + HTML + end + + def to_html + get_oembed.html + end + end + end +end diff --git a/engine/simplecast_onebox.rb b/engine/simplecast_onebox.rb new file mode 100644 index 00000000..10e9afe5 --- /dev/null +++ b/engine/simplecast_onebox.rb @@ -0,0 +1,30 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class SimplecastOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{https?://(.+)?simplecast.com/(episodes|s)/.*}) + always_https + requires_iframe_origins("https://player.simplecast.com") + + def to_html + get_oembed.html + end + + def placeholder_html + oembed = get_oembed + 
return if oembed.thumbnail_url.blank? + "" + end + + private + + def get_oembed_url + "https://api.simplecast.com/oembed?url=#{url}" + end + end + end +end diff --git a/engine/sketch_fab_onebox.rb b/engine/sketch_fab_onebox.rb new file mode 100644 index 00000000..b946f327 --- /dev/null +++ b/engine/sketch_fab_onebox.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class SketchFabOnebox + include Engine + include StandardEmbed + + matches_regexp( + /^https?:\/\/sketchfab\.com\/(?:models\/|3d-models\/(?:[^\/\s]+-)?)([a-z0-9]{32})/, + ) + always_https + requires_iframe_origins("https://sketchfab.com") + + def to_html + og = get_opengraph + src = og.video_url.gsub("autostart=1", "") + + <<-HTML + + HTML + end + + def placeholder_html + "" + end + end + end +end diff --git a/engine/slides_onebox.rb b/engine/slides_onebox.rb new file mode 100644 index 00000000..3681b2fa --- /dev/null +++ b/engine/slides_onebox.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class SlidesOnebox + include Engine + include StandardEmbed + + matches_regexp(/^https?:\/\/slides\.com\/[\p{Alnum}_\-]+\/[\p{Alnum}_\-]+$/) + requires_iframe_origins "https://slides.com" + + def to_html + <<-HTML + + HTML + end + + def placeholder_html + escaped_src = ::Onebox::Helpers.normalize_url_for_output(raw[:image]) + "" + end + end + end +end diff --git a/engine/sound_cloud_onebox.rb b/engine/sound_cloud_onebox.rb new file mode 100644 index 00000000..48db722d --- /dev/null +++ b/engine/sound_cloud_onebox.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class SoundCloudOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://soundcloud\.com}) + requires_iframe_origins "https://w.soundcloud.com" + always_https + + def to_html + oembed = get_oembed + oembed.html.gsub("visual=true", "visual=false") + end + + def placeholder_html + oembed = get_oembed + return if 
oembed.thumbnail_url.blank? + "" + end + + protected + + def get_oembed_url + oembed_url = "https://soundcloud.com/oembed.json?url=#{url}" + oembed_url += "&maxheight=166" unless url["/sets/"] + oembed_url + end + end + end +end diff --git a/engine/stack_exchange_onebox.rb b/engine/stack_exchange_onebox.rb new file mode 100644 index 00000000..c64400d0 --- /dev/null +++ b/engine/stack_exchange_onebox.rb @@ -0,0 +1,66 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class StackExchangeOnebox + include Engine + include LayoutSupport + include JSON + + def self.domains + %w[ + stackexchange.com + stackoverflow.com + superuser.com + serverfault.com + askubuntu.com + stackapps.com + mathoverflow.net + ].map { |domain| Regexp.escape(domain) } + end + + matches_regexp( + %r{^https?://(?:(?:(?\w*)\.)?(?\w*)\.)?(?#{domains.join("|")})/((?:questions|q)/(?\d*)(/.*/(?\d*))?|(a/(?\d*)))}, + ) + + def always_https? + uri.host.split(".").length <= 3 + end + + private + + def match + @match ||= @url.match(@@matcher) + end + + def url + domain = uri.host + question_id = match[:question_id] + answer_id = match[:answer_id2] || match[:answer_id1] + + if answer_id + "https://api.stackexchange.com/2.2/answers/#{answer_id}?site=#{domain}&filter=!.FjueITQdx6-Rq3Ue9PWG.QZ2WNdW" + else + "https://api.stackexchange.com/2.2/questions/#{question_id}?site=#{domain}&filter=!5-duuxrJa-iw9oVvOA(JNimB5VIisYwZgwcfNI" + end + end + + def data + return @data if defined?(@data) + + result = raw["items"][0] + if result + result["creation_date"] = Time.at(result["creation_date"].to_i).strftime( + "%I:%M%p - %d %b %y %Z", + ) + + result["tags"] = result["tags"].take(4).join(", ") + result["is_answer"] = result.key?("answer_id") + result["is_question"] = result.key?("question_id") + end + + @data = result + end + end + end +end diff --git a/engine/standard_embed.rb b/engine/standard_embed.rb new file mode 100644 index 00000000..c21ca818 --- /dev/null +++ b/engine/standard_embed.rb @@ 
-0,0 +1,203 @@ +# frozen_string_literal: true + +require "cgi" +require "onebox/normalizer" +require "onebox/open_graph" +require "onebox/oembed" +require "onebox/json_ld" + +module Onebox + module Engine + module StandardEmbed + def self.oembed_providers + @@oembed_providers ||= {} + end + + def self.add_oembed_provider(regexp, endpoint) + oembed_providers[regexp] = endpoint + end + + def self.opengraph_providers + @@opengraph_providers ||= [] + end + + def self.add_opengraph_provider(regexp) + opengraph_providers << regexp + end + + # Some oembed providers (like meetup.com) don't provide links to themselves + add_oembed_provider(%r{www\.meetup\.com/}, "http://api.meetup.com/oembed") + add_oembed_provider(%r{www\.mixcloud\.com/}, "https://www.mixcloud.com/oembed/") + # In order to support Private Videos + add_oembed_provider(%r{vimeo\.com/}, "https://vimeo.com/api/oembed.json") + # NYT requires login so use oembed only + add_oembed_provider(%r{nytimes\.com/}, "https://www.nytimes.com/svc/oembed/json/") + + def always_https? 
+ AllowlistedGenericOnebox.host_matches(uri, AllowlistedGenericOnebox.https_hosts) || super + end + + def raw + return @raw if defined?(@raw) + + @raw = {} + + set_opengraph_data_on_raw + set_twitter_data_on_raw + set_oembed_data_on_raw + set_json_ld_data_on_raw + set_favicon_data_on_raw + set_description_on_raw + + @raw + end + + protected + + def html_doc + return @html_doc if defined?(@html_doc) + + headers = nil + headers = { "Cookie" => options[:cookie] } if options[:cookie] + + @html_doc = Onebox::Helpers.fetch_html_doc(url, headers) + end + + def get_oembed + @oembed ||= Onebox::Oembed.new(get_json_response) + end + + def get_opengraph + @opengraph ||= ::Onebox::OpenGraph.new(html_doc) + end + + def get_twitter + return {} unless html_doc + + twitter = {} + + html_doc + .css("meta") + .each do |m| + if (m["property"] && m["property"][/^twitter:(.+)$/i]) || + (m["name"] && m["name"][/^twitter:(.+)$/i]) + value = (m["content"] || m["value"]).to_s + twitter[$1.tr("-:", "_").to_sym] ||= value if (value.present? && value != "0 minutes") + end + end + + twitter + end + + def get_favicon + return nil unless html_doc + + favicon = + html_doc.css( + 'link[rel="shortcut icon"], link[rel="icon shortcut"], link[rel="shortcut"], link[rel="icon"]', + ).first + favicon = favicon.nil? ? nil : (favicon["href"].nil? ? nil : favicon["href"].strip) + + return nil if favicon.blank? + + absolute_url = Onebox::Helpers.get_absolute_image_url(favicon, url) + + return nil if absolute_url.length > UrlHelper::MAX_URL_LENGTH + + absolute_url + end + + def get_description + return nil unless html_doc + + description = html_doc.at("meta[name='description']").to_h["content"] + description ||= html_doc.at("meta[name='Description']").to_h["content"] + + description + end + + def get_json_response + oembed_url = get_oembed_url + + return "{}" if oembed_url.blank? 
+ + begin + Onebox::Helpers.fetch_response(oembed_url) + rescue StandardError + "{}" + end + rescue Errno::ECONNREFUSED, Net::HTTPError, Net::HTTPFatalError, MultiJson::LoadError + "{}" + end + + def get_oembed_url + oembed_url = nil + + StandardEmbed.oembed_providers.each do |regexp, endpoint| + if url =~ regexp + oembed_url = "#{endpoint}?url=#{url}" + break + end + end + + if html_doc + if oembed_url.blank? + application_json = html_doc.at("//link[@type='application/json+oembed']/@href") + oembed_url = application_json.value if application_json + end + + if oembed_url.blank? + text_json = html_doc.at("//link[@type='text/json+oembed']/@href") + oembed_url ||= text_json.value if text_json + end + end + + oembed_url + end + + def get_json_ld + @json_ld ||= Onebox::JsonLd.new(html_doc) + end + + def set_from_normalizer_data(normalizer) + normalizer.data.each do |k, _| + v = normalizer.send(k) + @raw[k] ||= v unless v.nil? + end + end + + def set_opengraph_data_on_raw + og = get_opengraph + set_from_normalizer_data(og) + @raw.except!(:title_attr) + end + + def set_twitter_data_on_raw + twitter = get_twitter + twitter.each { |k, v| @raw[k] ||= v if v.present? } + end + + def set_oembed_data_on_raw + oembed = get_oembed + set_from_normalizer_data(oembed) + end + + def set_json_ld_data_on_raw + json_ld = get_json_ld + set_from_normalizer_data(json_ld) + end + + def set_favicon_data_on_raw + favicon = get_favicon + @raw[:favicon] = favicon if favicon.present? + end + + def set_description_on_raw + unless @raw[:description] + description = get_description + @raw[:description] = description if description.present? 
+ end + end + end + end +end diff --git a/engine/steam_store_onebox.rb b/engine/steam_store_onebox.rb new file mode 100644 index 00000000..2642bf0e --- /dev/null +++ b/engine/steam_store_onebox.rb @@ -0,0 +1,41 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class SteamStoreOnebox + include Engine + include StandardEmbed + + always_https + matches_regexp(%r{^https?://store\.steampowered\.com/app/\d+}) + requires_iframe_origins "https://store.steampowered.com" + + def placeholder_html + og = get_opengraph + <<-HTML +
+
+

#{og.title}

+ +

#{og.description}

+
+
+ HTML + end + + def to_html + iframe_url = @url[%r{https?://store\.steampowered\.com/app/\d+}].gsub("/app/", "/widget/") + escaped_src = ::Onebox::Helpers.normalize_url_for_output(iframe_url) + + <<-HTML + + HTML + end + end + end +end diff --git a/engine/threads_status_onebox.rb b/engine/threads_status_onebox.rb new file mode 100644 index 00000000..1b835ce8 --- /dev/null +++ b/engine/threads_status_onebox.rb @@ -0,0 +1,91 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class ThreadsStatusOnebox + include Engine + include LayoutSupport + include HTML + + matches_regexp(%r{^https?://www\.threads\.net/t/(?[\d\w_-]+)/?.*?$}) + always_https + + def self.priority + 1 + end + + private + + def link + raw.css("link[rel='canonical']").first["href"] + end + + def likes + @og[:description].split(" ").first + end + + def replies + @og[:description].split(", ").drop(1).join(", ").split(" repl").first + end + + def description + text = @og[:description].split(". ").drop(1).join(". 
") + linkify_mentions(text) + end + + def title + @og[:title].split(" (@").first + end + + def screen_name + @og[:title].split(" (@").drop(1).join(" (@").split(") on Threads")[0] + end + + def avatar + poster_response = + begin + Onebox::Helpers.fetch_response("https://www.threads.net/@#{screen_name}") + rescue StandardError + return nil + end + poster_html = Nokogiri.HTML(poster_response) + poster_data = ::Onebox::OpenGraph.new(poster_html).data + poster_data[:image] + end + + def image + @og[:image] + end + + def favicon + raw.css("link[rel='icon']").first["href"] + end + + def linkify_mentions(text) + text.gsub(/@([\w\d]+)/, "@\\1") + end + + def data + @og = ::Onebox::OpenGraph.new(raw).data + + @data ||= { + favicon: favicon, + link: link, + description: description, + image: image, + title: title, + screen_name: screen_name, + avatar: avatar, + likes: likes, + replies: replies, + } + + # if the image is the same as the avatar, don't show it + # means it's a thread with no image + @data[:image] = nil if @data[:image].split("?").first == @data[:avatar].split("?").first + + @data + end + end + end +end diff --git a/engine/tiktok_onebox.rb b/engine/tiktok_onebox.rb new file mode 100644 index 00000000..d015479c --- /dev/null +++ b/engine/tiktok_onebox.rb @@ -0,0 +1,60 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class TiktokOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://((?:m|www)\.)?tiktok\.com(?:/@(.+)\/video/|/v/)\d+(/\w+)?/?}) + requires_iframe_origins "https://www.tiktok.com" + always_https + + def placeholder_html + <<-HTML + + HTML + end + + def to_html + video_height = oembed_data.thumbnail_height < 1024 ? 
998 : oembed_data.thumbnail_height + height = (323.0 / 576) * video_height + + <<-HTML + + HTML + end + + private + + def oembed_data + @oembed_data = get_oembed + end + + def get_oembed_url + "https://www.tiktok.com/oembed?url=#{url}" + end + end + end +end diff --git a/engine/trello_onebox.rb b/engine/trello_onebox.rb new file mode 100644 index 00000000..810ff588 --- /dev/null +++ b/engine/trello_onebox.rb @@ -0,0 +1,34 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class TrelloOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https://trello\.com/[bc]/\W*}) + requires_iframe_origins "https://trello.com" + always_https + + def to_html + src = "https://trello.com/#{match[:type]}/#{match[:key]}.html" + height = match[:type] == "b" ? 400 : 200 + + <<-HTML + + HTML + end + + def placeholder_html + ::Onebox::Helpers.generic_placeholder_html + end + + private + + def match + return @match if defined?(@match) + @match = @url.match(%{trello\.com/(?[^/]+)/(?[^/]+)/?\W*}) + end + end + end +end diff --git a/engine/twitch_clips_onebox.rb b/engine/twitch_clips_onebox.rb new file mode 100644 index 00000000..022a4e50 --- /dev/null +++ b/engine/twitch_clips_onebox.rb @@ -0,0 +1,20 @@ +# frozen_string_literal: true + +require_relative "../mixins/twitch_onebox" + +class Onebox::Engine::TwitchClipsOnebox + def self.twitch_regexp + %r{^https?://clips\.twitch\.tv/([a-zA-Z0-9_]+/?[^#\?/]+)} + end + + include Onebox::Mixins::TwitchOnebox + requires_iframe_origins "https://clips.twitch.tv" + + def query_params + "clip=#{twitch_id}" + end + + def base_url + "clips.twitch.tv/embed?" 
+ end +end diff --git a/engine/twitch_stream_onebox.rb b/engine/twitch_stream_onebox.rb new file mode 100644 index 00000000..ca339aea --- /dev/null +++ b/engine/twitch_stream_onebox.rb @@ -0,0 +1,15 @@ +# frozen_string_literal: true + +require_relative "../mixins/twitch_onebox" + +class Onebox::Engine::TwitchStreamOnebox + def self.twitch_regexp + /^https?:\/\/(?:www\.|go\.)?twitch\.tv\/(?!directory)([a-zA-Z0-9_]{4,25})$/ + end + + include Onebox::Mixins::TwitchOnebox + + def query_params + "channel=#{twitch_id}" + end +end diff --git a/engine/twitch_video_onebox.rb b/engine/twitch_video_onebox.rb new file mode 100644 index 00000000..9312c6bd --- /dev/null +++ b/engine/twitch_video_onebox.rb @@ -0,0 +1,15 @@ +# frozen_string_literal: true + +require_relative "../mixins/twitch_onebox" + +class Onebox::Engine::TwitchVideoOnebox + def self.twitch_regexp + %r{^https?://(?:www\.)?twitch\.tv/videos/([0-9]+)} + end + + include Onebox::Mixins::TwitchOnebox + + def query_params + "video=v#{twitch_id}" + end +end diff --git a/engine/twitter_status_onebox.rb b/engine/twitter_status_onebox.rb new file mode 100644 index 00000000..706751b4 --- /dev/null +++ b/engine/twitter_status_onebox.rb @@ -0,0 +1,234 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class TwitterStatusOnebox + include Engine + include LayoutSupport + include HTML + include ActionView::Helpers::NumberHelper + + matches_regexp( + %r{^https?://(mobile\.|www\.)?twitter\.com/.+?/status(es)?/\d+(/(video|photo)/\d?+)?+(/?\?.*)?/?$}, + ) + always_https + + def http_params + { "User-Agent" => "DiscourseBot/1.0" } + end + + def to_html + raw.present? ? 
super : "" + end + + private + + def get_twitter_data + response = + begin + Onebox::Helpers.fetch_response(url, headers: http_params) + rescue StandardError + return nil + end + html = Nokogiri.HTML(response) + twitter_data = {} + html + .css("meta") + .each do |m| + if m.attribute("property") && m.attribute("property").to_s.match(/^og:/i) + m_content = m.attribute("content").to_s.strip + m_property = m.attribute("property").to_s.gsub("og:", "").gsub(":", "_") + twitter_data[m_property.to_sym] = m_content + end + end + twitter_data + end + + def match + @match ||= @url.match(%r{twitter\.com/.+?/status(es)?/(?\d+)}) + end + + def twitter_data + @twitter_data ||= get_twitter_data + end + + def guess_tweet_index + usernames = meta_tags_data("additionalName").compact + usernames.each_with_index do |username, index| + return index if twitter_data[:url].to_s.include?(username) + end + end + + def tweet_index + @tweet_index ||= guess_tweet_index + end + + def client + Onebox.options.twitter_client + end + + def twitter_api_credentials_present? + client && !client.twitter_credentials_missing? + end + + def symbolize_keys(obj) + case obj + when Array + obj.map { |item| symbolize_keys(item) } + when Hash + obj.each_with_object({}) do |(key, value), result| + result[key.to_sym] = symbolize_keys(value) + end + else + obj + end + end + + def raw + if twitter_api_credentials_present? + @raw ||= symbolize_keys(client.status(match[:id])) + else + super + end + end + + def tweet + if twitter_api_credentials_present? + client.prettify_tweet(raw)&.strip + else + twitter_data[:description].gsub(/“(.+?)”/im) { $1 } if twitter_data[:description] + end + end + + def timestamp + if twitter_api_credentials_present? && (created_at = raw.dig(:data, :created_at)) + date = DateTime.strptime(created_at, "%Y-%m-%dT%H:%M:%S.%L%z") + date.strftime("%-l:%M %p - %-d %b %Y") + end + end + + def title + if twitter_api_credentials_present? 
+ raw.dig(:includes, :users)&.first&.dig(:name) + else + meta_tags_data("givenName")[tweet_index] + end + end + + def screen_name + if twitter_api_credentials_present? + raw.dig(:includes, :users)&.first&.dig(:username) + else + meta_tags_data("additionalName")[tweet_index] + end + end + + def avatar + if twitter_api_credentials_present? + raw.dig(:includes, :users)&.first&.dig(:profile_image_url) + end + end + + def likes + if twitter_api_credentials_present? + prettify_number(raw.dig(:data, :public_metrics, :like_count).to_i) + end + end + + def retweets + if twitter_api_credentials_present? + prettify_number(raw.dig(:data, :public_metrics, :retweet_count).to_i) + end + end + + def is_reply + if twitter_api_credentials_present? + raw.dig(:data, :referenced_tweets)&.any? { |tweet| tweet.dig(:type) == "replied_to" } + end + end + + def quoted_full_name + if twitter_api_credentials_present? && quoted_tweet_author.present? + quoted_tweet_author[:name] + end + end + + def quoted_screen_name + if twitter_api_credentials_present? && quoted_tweet_author.present? + quoted_tweet_author[:username] + end + end + + def quoted_text + quoted_tweet[:text] if twitter_api_credentials_present? && quoted_tweet.present? + end + + def quoted_link + if twitter_api_credentials_present? 
+ "https://twitter.com/#{quoted_screen_name}/status/#{quoted_status_id}" + end + end + + def quoted_status_id + raw.dig(:data, :referenced_tweets)&.find { |ref| ref[:type] == "quoted" }&.dig(:id) + end + + def quoted_tweet + raw.dig(:includes, :tweets)&.find { |tweet| tweet[:id] == quoted_status_id } + end + + def quoted_tweet_author + raw.dig(:includes, :users)&.find { |user| user[:id] == quoted_tweet&.dig(:author_id) } + end + + def prettify_number(count) + if count > 0 + number_to_human( + count, + format: "%n%u", + precision: 2, + units: { + thousand: "K", + million: "M", + billion: "B", + }, + ) + end + end + + def attr_at_css(css_property, attribute_name) + raw.at_css(css_property)&.attr(attribute_name) + end + + def meta_tags_data(attribute_name) + data = [] + raw + .css("meta") + .each do |m| + if m.attribute("itemprop") && m.attribute("itemprop").to_s.strip == attribute_name + data.push(m.attribute("content").to_s.strip) + end + end + data + end + + def data + @data ||= { + link: link, + tweet: tweet, + timestamp: timestamp, + title: title, + screen_name: screen_name, + avatar: avatar, + likes: likes, + retweets: retweets, + is_reply: is_reply, + quoted_text: quoted_text, + quoted_full_name: quoted_full_name, + quoted_screen_name: quoted_screen_name, + quoted_link: quoted_link, + } + end + end + end +end diff --git a/engine/typeform_onebox.rb b/engine/typeform_onebox.rb new file mode 100644 index 00000000..e94a8c07 --- /dev/null +++ b/engine/typeform_onebox.rb @@ -0,0 +1,48 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class TypeformOnebox + include Engine + + matches_regexp(%r{^https?://[a-z0-9\-_]+\.typeform\.com/to/[a-zA-Z0-9]+}) + requires_iframe_origins "https://*.typeform.com" + always_https + + def to_html + typeform_src = build_typeform_src + + <<~HTML + + HTML + end + + def placeholder_html + ::Onebox::Helpers.generic_placeholder_html + end + + private + + def build_typeform_src + escaped_src = 
::Onebox::Helpers.normalize_url_for_output(@url) + query_params = CGI.parse(URI.parse(escaped_src).query || "") + + return escaped_src if query_params.has_key?("typeform-embed") + + if query_params.empty? + escaped_src += "?" unless escaped_src.end_with?("?") + else + escaped_src += "&" + end + + escaped_src += "typeform-embed=embed-widget" + end + end + end +end diff --git a/engine/video_onebox.rb b/engine/video_onebox.rb new file mode 100644 index 00000000..b0483bfb --- /dev/null +++ b/engine/video_onebox.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class VideoOnebox + include Engine + + matches_regexp(%r{^(https?:)?//.*\.(mov|mp4|webm|ogv)(\?.*)?$}i) + + def always_https? + AllowlistedGenericOnebox.host_matches(uri, AllowlistedGenericOnebox.https_hosts) + end + + def to_html + # Fix Dropbox image links + if @url[%r{^https://www.dropbox.com/s/}] + @url.sub!("https://www.dropbox.com", "https://dl.dropboxusercontent.com") + end + + escaped_url = ::Onebox::Helpers.normalize_url_for_output(@url) + <<-HTML +
+ +
+ HTML + end + + def placeholder_html + SiteSetting.enable_diffhtml_preview ? to_html : ::Onebox::Helpers.video_placeholder_html + end + end + end +end diff --git a/engine/vimeo_onebox.rb b/engine/vimeo_onebox.rb new file mode 100644 index 00000000..a5e1aace --- /dev/null +++ b/engine/vimeo_onebox.rb @@ -0,0 +1,47 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class VimeoOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://(www\.)?vimeo\.com/\d+(/\w+)?/?}) + requires_iframe_origins "https://player.vimeo.com" + always_https + + def placeholder_html + ::Onebox::Helpers.video_placeholder_html + end + + def to_html + video_src = Nokogiri::HTML5.fragment(oembed_data[:html]).at_css("iframe")&.[]("src") + video_src = "https://player.vimeo.com/video/#{oembed_data[:video_id]}" if video_src.blank? + video_src = video_src.gsub("autoplay=1", "").chomp("?") + + <<-HTML + + HTML + end + + private + + def oembed_data + response = Onebox::Helpers.fetch_response("https://vimeo.com/api/oembed.json?url=#{url}") + @oembed_data = ::MultiJson.load(response, symbolize_keys: true) + rescue StandardError + "{}" + end + + def og_data + @og_data = get_opengraph + end + end + end +end diff --git a/engine/wikimedia_onebox.rb b/engine/wikimedia_onebox.rb new file mode 100644 index 00000000..5fc16d7e --- /dev/null +++ b/engine/wikimedia_onebox.rb @@ -0,0 +1,43 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class WikimediaOnebox + include Engine + include LayoutSupport + include JSON + + matches_regexp(%r{^https?://commons\.wikimedia\.org/wiki/(File:.+)}) + always_https + + def self.priority + # Wikimedia links end in an image extension. + # E.g. https://commons.wikimedia.org/wiki/File:Stones_members_montage2.jpg + # This engine should have priority over the generic ImageOnebox. 
+ + 1 + end + + def url + "https://en.wikipedia.org/w/api.php?action=query&titles=#{match[:name]}&prop=imageinfo&iilimit=50&iiprop=timestamp|user|url&iiurlwidth=500&format=json" + end + + private + + def match + @match ||= @url.match(%r{^https?://commons\.wikimedia\.org/wiki/(?File:.+)}) + end + + def data + first_page = raw["query"]["pages"].first[1] + + { + link: first_page["imageinfo"].first["descriptionurl"], + title: first_page["title"], + image: first_page["imageinfo"].first["url"], + thumbnail: first_page["imageinfo"].first["thumburl"], + } + end + end + end +end diff --git a/engine/wikipedia_onebox.rb b/engine/wikipedia_onebox.rb new file mode 100644 index 00000000..1ccb4839 --- /dev/null +++ b/engine/wikipedia_onebox.rb @@ -0,0 +1,110 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class WikipediaOnebox + include Engine + include LayoutSupport + include HTML + + matches_regexp(%r{^https?://.*\.wikipedia\.(com|org)}) + always_https + + private + + def data + paras = [] + text = "" + + # Detect section Hash in the url and retrive the related paragraphs. if no hash provided the first few paragraphs will be used + # Author Lidlanca + # Date 9/8/2014 + if (m_url_hash = @url.match(%r{#([^/?]+)})) # extract url hash + m_url_hash_name = m_url_hash[1] + end + + if m_url_hash.nil? # no hash found in url + paras = raw.search("p") # default get all the paras + else + section_header_title = raw.xpath("//span[@id='#{CGI.unescape(m_url_hash_name)}']") + + if section_header_title.empty? + paras = raw.search("p") # default get all the paras + else + section_title_text = section_header_title.inner_text + section_header = section_header_title[0].parent # parent element of the section span element should be an

node + cur_element = section_header + + # p|text|div covers the general case. We assume presence of at least 1 P node. if section has no P node we may end up with a P node from the next section. + # div tag is commonly used as an assets wraper in an article section. often as the first element holding an image. + # ul support will imporve the output generated for a section with a list as the main content (for example: an Author Bibliography, A musician Discography, etc) + first_p_found = nil + while ( + ((next_sibling = cur_element.next_sibling).name =~ /p|text|div|ul/) || + first_p_found.nil? + ) + # from section header get the next sibling until it is a breaker tag + cur_element = next_sibling + if (cur_element.name == "p" || cur_element.name == "ul") #we treat a list as we detect a p to avoid showing + first_p_found = true + paras.push(cur_element) + end + end + end + end + + unless paras.empty? + cnt = 0 + while text.length < Onebox::LayoutSupport.max_text && cnt <= 3 + break if cnt >= paras.size + text += " " unless cnt == 0 + + if paras[cnt].name == "ul" # Handle UL tag. Generate a textual ordered list (1.item | 2.item | 3.item). Unfortunately no newline allowed in output + li_index = 1 + list_items = [] + paras[cnt] + .children + .css("li") + .each do |li| + list_items.push "#{li_index}." + li.inner_text + li_index += 1 + end + paragraph = (list_items.join " |\n ")[0..Onebox::LayoutSupport.max_text] + else + paragraph = paras[cnt].inner_text[0..Onebox::LayoutSupport.max_text] + end + + paragraph.gsub!(/\[\d+\]/mi, "") + text += paragraph + cnt += 1 + end + end + + text = "#{text[0..Onebox::LayoutSupport.max_text]}..." if text.length > + Onebox::LayoutSupport.max_text + + result = { + link: link, + title: + raw.css("html body h1").inner_text + + (section_title_text ? 
" | " + section_title_text : ""), #if a section sub title exists add it to the main article title + description: text, + } + + img = raw.css(".image img") + + if img && img.size > 0 + img.each do |i| + src = i["src"] + if src !~ /Question_book/ + result[:image] = src + break + end + end + end + + result + end + end + end +end diff --git a/engine/wistia_onebox.rb b/engine/wistia_onebox.rb new file mode 100644 index 00000000..de5b9c27 --- /dev/null +++ b/engine/wistia_onebox.rb @@ -0,0 +1,48 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class WistiaOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{https?://(.+)?(wistia.com|wi.st)/(medias|embed)/.*}) + requires_iframe_origins("https://fast.wistia.com", "https://fast.wistia.net") + always_https + + def to_html + oembed = get_oembed + extracted_url = oembed.html.match(/iframe\ src\=\"(.*?)\"/) + + if extracted_url + iframe_src = extracted_url[1] + + <<~HTML + + HTML + else + oembed.html + end + end + + def placeholder_html + oembed = get_oembed + return if oembed.thumbnail_url.blank? 
+ "" + end + + private + + def get_oembed_url + "https://fast.wistia.com/oembed?embedType=iframe&url=#{url}" + end + end + end +end diff --git a/engine/xkcd_onebox.rb b/engine/xkcd_onebox.rb new file mode 100644 index 00000000..171e7e5d --- /dev/null +++ b/engine/xkcd_onebox.rb @@ -0,0 +1,27 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class XkcdOnebox + include Engine + include LayoutSupport + include JSON + + matches_regexp(%r{^https?://(www\.)?(m\.)?xkcd\.com/\d+}) + + def url + "https://xkcd.com/#{match[:comic_id]}/info.0.json" + end + + private + + def match + @match ||= @url.match(%{xkcd\.com/(?\\d+)}) + end + + def data + { link: @url, title: raw["safe_title"], image: raw["img"], description: raw["alt"] } + end + end + end +end diff --git a/engine/youku_onebox.rb b/engine/youku_onebox.rb new file mode 100644 index 00000000..7b59fac6 --- /dev/null +++ b/engine/youku_onebox.rb @@ -0,0 +1,35 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class YoukuOnebox + include Engine + include HTML + + matches_regexp(%r{^(https?://)?([\da-z\.-]+)(youku.com/)(.)+/?$}) + requires_iframe_origins "https://player.youku.com" + + # Try to get the video ID. 
Works for URLs of the form: + # * http://v.youku.com/v_show/id_XNjM3MzAxNzc2.html + # * http://v.youku.com/v_show/id_XMTQ5MjgyMjMyOA==.html?from=y1.3-tech-index3-232-10183.89969-89963.3-1 + def video_id + match = uri.path.match(%r{/v_show/id_([a-zA-Z0-9_=\-]+)(\.html)?.*}) + match && match[1] + rescue StandardError + nil + end + + def to_html + <<~HTML + + HTML + end + end + end +end diff --git a/engine/youtube_onebox.rb b/engine/youtube_onebox.rb new file mode 100644 index 00000000..0b6839d0 --- /dev/null +++ b/engine/youtube_onebox.rb @@ -0,0 +1,186 @@ +# frozen_string_literal: true + +module Onebox + module Engine + class YoutubeOnebox + include Engine + include StandardEmbed + + matches_regexp(%r{^https?://(?:www\.)?(?:m\.)?(?:youtube\.com|youtu\.be)/.+$}) + requires_iframe_origins "https://www.youtube.com" + always_https + + WIDTH ||= 480 + HEIGHT ||= 360 + + def parse_embed_response + return unless video_id + return @parse_embed_response if defined?(@parse_embed_response) + + embed_url = "https://www.youtube.com/embed/#{video_id}" + @embed_doc ||= Onebox::Helpers.fetch_html_doc(embed_url) + + begin + script_tag = + @embed_doc.xpath("//script").find { |tag| tag.to_s.include?("ytcfg.set") }.to_s + match = script_tag.to_s.match(/ytcfg\.set\((?.*)\)/) + + yt_json = ::JSON.parse(match[:json]) + renderer = + ::JSON.parse(yt_json["PLAYER_VARS"]["embedded_player_response"])["embedPreview"][ + "thumbnailPreviewRenderer" + ] + + title = renderer["title"]["runs"].first["text"] + + image = "https://img.youtube.com/vi/#{video_id}/hqdefault.jpg" + rescue StandardError + return + end + + @parse_embed_response = { image: image, title: title } + end + + def placeholder_html + if video_id || list_id + result = parse_embed_response + result ||= get_opengraph.data + + "" + else + to_html + end + end + + def to_html + if video_id + <<-HTML + + HTML + elsif list_id + <<-HTML + + HTML + else + # for channel pages + html = Onebox::Engine::AllowlistedGenericOnebox.new(@url, 
@timeout).to_html + return if html.blank? + html.gsub!(%r{['"]//}, "https://") + html + end + end + + def video_title + @video_title ||= + begin + result = parse_embed_response || get_opengraph.data + result[:title] + end + end + + private + + def video_id + @video_id ||= + begin + id = nil + + # http://youtu.be/afyK1HSFfgw + id = uri.path[%r{/([\w\-]+)}, 1] if uri.host["youtu.be"] + + # https://www.youtube.com/embed/vsF0K3Ou1v0 + id ||= uri.path[%r{/embed/([\w\-]+)}, 1] if uri.path["/embed/"] + + # https://www.youtube.com/shorts/wi2jAtpBl0Y + id ||= uri.path[%r{/shorts/([\w\-]+)}, 1] if uri.path["/shorts/"] + + # https://www.youtube.com/watch?v=Z0UISCEe52Y + id ||= params["v"] + + sanitize_yt_id(id) + end + end + + def list_id + @list_id ||= sanitize_yt_id(params["list"]) + end + + def sanitize_yt_id(raw) + raw&.match?(/\A[\w-]+\z/) ? raw : nil + end + + def embed_params + p = { "feature" => "oembed", "wmode" => "opaque" } + + p["list"] = list_id if list_id + + # Parse timestrings, and assign the result as a start= parameter + start = + if params["start"] + params["start"] + elsif params["t"] + params["t"] + elsif uri.fragment && uri.fragment.start_with?("t=") + # referencing uri is safe here because any throws were already caught by video_id returning nil + # remove the t= from the start + uri.fragment[2..-1] + end + + p["start"] = parse_timestring(start) if start + p["end"] = parse_timestring params["end"] if params["end"] + + # Official workaround for looping videos + # https://developers.google.com/youtube/player_parameters#loop + # use params.include? 
so that you can just add "&loop" + if params.include?("loop") + p["loop"] = 1 + p["playlist"] = video_id + end + + # https://developers.google.com/youtube/player_parameters#rel + p["rel"] = 0 if params.include?("rel") + + # https://developers.google.com/youtube/player_parameters#enablejsapi + p["enablejsapi"] = params["enablejsapi"] if params.include?("enablejsapi") + + URI.encode_www_form(p) + end + + def parse_timestring(string) + ($1.to_i * 3600) + ($2.to_i * 60) + $3.to_i if string =~ /(\d+h)?(\d+m)?(\d+s?)?/ + end + + def params + return {} unless uri.query + # This mapping is necessary because CGI.parse returns a hash of keys to arrays. + # And *that* is necessary because querystrings support arrays, so they + # force you to deal with it to avoid security issues that would pop up + # if one day it suddenly gave you an array. + # + # However, we aren't interested. Just take the first one. + @params ||= + begin + p = {} + CGI.parse(uri.query).each { |k, v| p[k] = v.first } + p + end + rescue StandardError + {} + end + end + end +end diff --git a/file_type_finder.rb b/file_type_finder.rb new file mode 100644 index 00000000..5d1ad772 --- /dev/null +++ b/file_type_finder.rb @@ -0,0 +1,64 @@ +# frozen_string_literal: true + +module Onebox + module FileTypeFinder + # In general, most of file extension names would be recognized + # by Highlights.js. However, some need to be checked in other + # ways, either because they just aren't included, because they + # are extensionless, or because they contain dots (they are + # multi-part). + # IMPORTANT: to prevent false positive matching, start all + # entries on this list with a "." + # + # For easy reference, keep these sorted in alphabetical order. 
+ @long_file_types = { + ".bib" => "tex", + ".html.hbs" => "handlebars", + ".html.handlebars" => "handlebars", + ".latex" => "tex", + ".ru" => "rb", + ".simplecov" => "rb", # Not official, but seems commonly found + ".sty" => "tex", + } + + # Some extensionless files for which we know the type + # These should all be stored LOWERCASE, just for consistency. + # The ones that I know of also include the ".lock" fake extension. + # + # For easy reference, keep these sorted in alphabetical order, + # FIRST by their types and THEN by their names. + @extensionless_files = { + "cmake.in" => "cmake", + "gruntfile" => "js", + "gulpfile" => "js", + "artisan" => "php", + "berksfile" => "rb", + "capfile" => "rb", + "cheffile" => "rb", + "cheffile.lock" => "rb", + "gemfile" => "rb", + "guardfile" => "rb", + "rakefile" => "rb", + "thorfile" => "rb", + "vagrantfile" => "rb", + "boxfile" => "yaml", # Not currently (2014-11) in Highlight.js + } + + def self.from_file_name(file_name) + lower_name = file_name.downcase + # First check against the known lists of "special" files and extensions. + return @extensionless_files[lower_name] if @extensionless_files.has_key?(lower_name) + + @long_file_types.each { |extension, type| return type if lower_name.end_with?(extension) } + + # Otherwise, just split on the last ".", + # but add one so we don't return the "." itself. + dot_spot = lower_name.rindex(".") + return lower_name[(dot_spot + 1)..-1] if dot_spot + + # If we couldn't figure it out from the name, + # let the highlighter figure it out from the content. 
+ "" + end + end +end diff --git a/helpers.rb b/helpers.rb new file mode 100644 index 00000000..3d2ee701 --- /dev/null +++ b/helpers.rb @@ -0,0 +1,279 @@ +# frozen_string_literal: true + +require "addressable" + +module Onebox + module Helpers + class DownloadTooLarge < StandardError + end + + IGNORE_CANONICAL_DOMAINS ||= %w[www.instagram.com medium.com youtube.com] + + def self.clean(html) + html.gsub(/<[^>]+>/, " ").gsub(/\n/, "") + end + + def self.fetch_html_doc(url, headers = nil, body_cacher = nil) + response = + ( + begin + fetch_response(url, headers: headers, body_cacher: body_cacher) + rescue StandardError + nil + end + ) + doc = Nokogiri.HTML(response) + uri = Addressable::URI.parse(url) + + ignore_canonical_tag = doc.at('meta[property="og:ignore_canonical"]') + should_ignore_canonical = + IGNORE_CANONICAL_DOMAINS.map { |hostname| uri.hostname.match?(hostname) }.any? + + if !(ignore_canonical_tag && ignore_canonical_tag["content"].to_s == "true") && + !should_ignore_canonical + # prefer canonical link + canonical_link = doc.at('//link[@rel="canonical"]/@href') + canonical_uri = Addressable::URI.parse(canonical_link) + if canonical_link && canonical_uri && + "#{canonical_uri.host}#{canonical_uri.path}" != "#{uri.host}#{uri.path}" + uri = + FinalDestination.new( + canonical_link, + Oneboxer.get_final_destination_options(canonical_link), + ).resolve + if uri.present? 
+ response = + ( + begin + fetch_response(uri.to_s, headers: headers, body_cacher: body_cacher) + rescue StandardError + nil + end + ) + doc = Nokogiri.HTML(response) if response + end + end + end + + doc + end + + def self.fetch_response( + location, + redirect_limit: 5, + domain: nil, + headers: nil, + body_cacher: nil + ) + redirect_limit = Onebox.options.redirect_limit if redirect_limit > + Onebox.options.redirect_limit + + raise Net::HTTPError.new("HTTP redirect too deep", location) if redirect_limit == 0 + + uri = Addressable::URI.parse(location) + uri = Addressable::URI.join(domain, uri) if !uri.host + + use_body_cacher = body_cacher && body_cacher.respond_to?("fetch_cached_response_body") + if use_body_cacher + response_body = body_cacher.fetch_cached_response_body(uri.to_s) + + return response_body if response_body.present? + end + + result = StringIO.new + FinalDestination::HTTP.start( + uri.host, + uri.port, + open_timeout: Onebox.options.connect_timeout, + use_ssl: uri.normalized_scheme == "https", + ) do |http| + http.read_timeout = Onebox.options.timeout + http.verify_mode = OpenSSL::SSL::VERIFY_NONE # Work around path building bugs + + headers ||= {} + + if Onebox.options.user_agent && !headers["User-Agent"] + headers["User-Agent"] = Onebox.options.user_agent + end + + request = Net::HTTP::Get.new(uri.request_uri, headers) + start_time = Time.now + + size_bytes = Onebox.options.max_download_kb * 1024 + http.request(request) do |response| + if cookie = response.get_fields("set-cookie") + # HACK: If this breaks again in the future, use HTTP::CookieJar from gem 'http-cookie' + # See test: it "does not send cookies to the wrong domain" + redir_header = { "Cookie" => cookie.join("; ") } + end + + redir_header = nil unless redir_header.is_a? Hash + + code = response.code.to_i + unless code === 200 + response.error! 
unless [301, 302, 303, 307, 308].include?(code) + + return( + fetch_response( + response["location"], + redirect_limit: redirect_limit - 1, + domain: "#{uri.scheme}://#{uri.host}", + headers: redir_header, + ) + ) + end + + response.read_body do |chunk| + result.write(chunk) + raise DownloadTooLarge.new if result.size > size_bytes + raise Timeout::Error.new if (Time.now - start_time) > Onebox.options.timeout + end + + if use_body_cacher && body_cacher.cache_response_body?(uri) + body_cacher.cache_response_body(uri.to_s, result.string) + end + + return result.string + end + end + end + + def self.fetch_content_length(location) + uri = URI(location) + + FinalDestination::HTTP.start( + uri.host, + uri.port, + open_timeout: Onebox.options.connect_timeout, + use_ssl: uri.is_a?(URI::HTTPS), + ) do |http| + http.read_timeout = Onebox.options.timeout + if uri.is_a?(URI::HTTPS) + http.use_ssl = true + http.verify_mode = OpenSSL::SSL::VERIFY_NONE + end + + http.request_head([uri.path, uri.query].join("?")) do |response| + return response.code.to_i == 200 ? response.content_length.presence : nil + end + end + end + + def self.pretty_filesize(size) + conv = %w[B KB MB GB TB PB EB] + scale = 1024 + + ndx = 1 + return "#{(size)} #{conv[ndx - 1]}" if (size < 2 * (scale**ndx)) + size = size.to_f + [2, 3, 4, 5, 6, 7].each do |i| + return "#{"%.2f" % (size / (scale**(i - 1)))} #{conv[i - 1]}" if (size < 2 * (scale**i)) + end + ndx = 7 + "#{"%.2f" % (size / (scale**(ndx - 1)))} #{conv[ndx - 1]}" + end + + def self.click_to_scroll_div(width = 690, height = 400) + "
" + end + + def self.truncate(string, length = 50) + return string if string.nil? + string.size > length ? string[0...(string.rindex(" ", length) || length)] + "..." : string + end + + def self.get(meta, attr) + (meta && meta[attr].present?) ? sanitize(meta[attr]) : nil + end + + def self.sanitize(value, length = 50) + return nil if value.blank? + Sanitize.fragment(value).strip + end + + def self.normalize_url_for_output(url) + return "" unless url + url = url.dup + # expect properly encoded url, remove any unsafe chars + url.gsub!(" ", "%20") + url.gsub!("'", "'") + url.gsub!('"', """) + url.gsub!(/[^\w\-`.~:\/?#\[\]@!$&'\(\)*+,;=%\p{M}’]/, "") + + parsed = Addressable::URI.parse(url) + return "" unless parsed.host + + url + end + + def self.get_absolute_image_url(src, url) + begin + URI.parse(url).merge(src).to_s + rescue ArgumentError, URI::BadURIError, URI::InvalidURIError + src + end + end + + # Percent-encodes a URI string per RFC3986 - https://tools.ietf.org/html/rfc3986 + def self.uri_encode(url) + return "" unless url + + uri = Addressable::URI.parse(url) + + encoded_uri = + Addressable::URI.new( + scheme: + Addressable::URI.encode_component( + uri.scheme, + Addressable::URI::CharacterClasses::SCHEME, + ), + authority: + Addressable::URI.encode_component( + uri.authority, + Addressable::URI::CharacterClasses::AUTHORITY, + ), + path: + Addressable::URI.encode_component( + uri.path, + Addressable::URI::CharacterClasses::PATH + "\\%", + ), + query: + Addressable::URI.encode_component( + uri.query, + "a-zA-Z0-9\\-\\.\\_\\~\\$\\&\\*\\,\\=\\:\\@\\?\\%", + ), + fragment: + Addressable::URI.encode_component( + uri.fragment, + "a-zA-Z0-9\\-\\.\\_\\~\\!\\$\\&\\'\\(\\)\\*\\+\\,\\;\\=\\:\\/\\?\\%", + ), + ) + + encoded_uri.to_s + end + + def self.uri_unencode(url) + Addressable::URI.unencode(url) + end + + def self.image_placeholder_html + "
" + end + + def self.video_placeholder_html + "
" + end + + def self.audio_placeholder_html + "
" + end + + def self.map_placeholder_html + "
" + end + + def self.generic_placeholder_html + "
" + end + end +end diff --git a/json_ld.rb b/json_ld.rb new file mode 100644 index 00000000..1dbc8faf --- /dev/null +++ b/json_ld.rb @@ -0,0 +1,46 @@ +# frozen_string_literal: true + +module Onebox + class JsonLd < Normalizer + # Full schema.org hierarchy can be found here: https://schema.org/docs/full.html + MOVIE_JSON_LD_TYPE = "Movie" + SUPPORTED_TYPES = [MOVIE_JSON_LD_TYPE] + + def initialize(doc) + @data = extract(doc) + end + + private + + def extract(doc) + return {} if doc.blank? + + doc + .css('script[type="application/ld+json"]') + .each do |element| + parsed_json = parse_json(element.text) + + if parsed_json.kind_of?(Array) + parsed_json = parsed_json.detect { |x| SUPPORTED_TYPES.include?(x["@type"]) } + return {} if !parsed_json + end + + case parsed_json["@type"] + when MOVIE_JSON_LD_TYPE + return Onebox::Movie.new(parsed_json).to_h + end + end + + {} + end + + def parse_json(json) + begin + JSON[json] + rescue JSON::ParserError => e + Discourse.warn_exception(e, message: "Error parsing JSON-LD: #{json}") + {} + end + end + end +end diff --git a/layout.rb b/layout.rb new file mode 100644 index 00000000..69c03842 --- /dev/null +++ b/layout.rb @@ -0,0 +1,55 @@ +# frozen_string_literal: true + +require_relative "template_support" + +module Onebox + class Layout < Mustache + include TemplateSupport + + VERSION = "1.0.0" + + attr_reader :record + attr_reader :view + + def initialize(name, record) + @record = record.deep_symbolize_keys + + # Fix any relative paths + if @record[:image] && @record[:image] =~ %r{\A/[^/]} + @record[:image] = "#{uri.scheme}://#{uri.host}/#{@record[:image]}" + end + + @md5 = Digest::MD5.new + @view = View.new(name, @record) + @template_name = "_layout" + @template_path = load_paths.last + end + + def to_html + render(details) + end + + private + + def uri + @uri ||= URI(::Onebox::Helpers.normalize_url_for_output(record[:link])) + end + + def details + { + link: record[:link], + title: record[:title], + favicon: record[:favicon], + 
domain: record[:domain] || uri.host.to_s.sub(/\Awww\./, ""), + article_published_time: record[:article_published_time], + article_published_time_title: record[:article_published_time_title], + metadata_1_label: record[:metadata_1_label], + metadata_1_value: record[:metadata_1_value], + metadata_2_label: record[:metadata_2_label], + metadata_2_value: record[:metadata_2_value], + subname: view.template_name, + view: view.to_html, + } + end + end +end diff --git a/layout_support.rb b/layout_support.rb new file mode 100644 index 00000000..aa8f6c9b --- /dev/null +++ b/layout_support.rb @@ -0,0 +1,17 @@ +# frozen_string_literal: true + +module Onebox + module LayoutSupport + def self.max_text + 500 + end + + def layout + @layout ||= Layout.new(self.class.onebox_name, data) + end + + def to_html + layout.to_html + end + end +end diff --git a/matcher.rb b/matcher.rb new file mode 100644 index 00000000..2f4b7fc3 --- /dev/null +++ b/matcher.rb @@ -0,0 +1,37 @@ +# frozen_string_literal: true + +module Onebox + class Matcher + def initialize(url, options = {}) + begin + @uri = URI(url) + rescue URI::InvalidURIError + end + + @options = options + end + + def ordered_engines + @ordered_engines ||= + Engine.engines.sort_by { |e| e.respond_to?(:priority) ? e.priority : 100 } + end + + def oneboxed + return if @uri.nil? + return if @uri.port && !Onebox.options.allowed_ports.include?(@uri.port) + return if @uri.scheme && !Onebox.options.allowed_schemes.include?(@uri.scheme) + + ordered_engines.find do |engine| + ( + engine.respond_to?(:handles_content_type?) && + engine.handles_content_type?(@options[:content_type]) || engine === @uri + ) && has_allowed_iframe_origins?(engine) + end + end + + def has_allowed_iframe_origins?(engine) + allowed_regexes = @options[:allowed_iframe_regexes] || [] + engine.iframe_origins.all? { |o| allowed_regexes.any? 
{ |r| o =~ r } } + end + end +end diff --git a/mixins/git_blob_onebox.rb b/mixins/git_blob_onebox.rb new file mode 100644 index 00000000..6d8b0fd2 --- /dev/null +++ b/mixins/git_blob_onebox.rb @@ -0,0 +1,243 @@ +# frozen_string_literal: true + +module Onebox + module Mixins + module GitBlobOnebox + def self.included(klass) + klass.include(Onebox::Engine) + klass.include(Onebox::LayoutSupport) + klass.matches_regexp(klass.git_regexp) + klass.always_https + klass.include(InstanceMethods) + end + + EXPAND_AFTER = 0b001 + EXPAND_BEFORE = 0b010 + EXPAND_NONE = 0b0 + + DEFAULTS = { + EXPAND_ONE_LINER: EXPAND_AFTER | EXPAND_BEFORE, #set how to expand a one liner. user EXPAND_NONE to disable expand + LINES_BEFORE: 10, + LINES_AFTER: 10, + SHOW_LINE_NUMBER: true, + MAX_LINES: 20, + MAX_CHARS: 5000, + } + + module InstanceMethods + def initialize(url, timeout = nil) + super url, timeout + # merge engine options from global Onebox.options interface + # self.options = Onebox.options["GithubBlobOnebox"] # self.class.name.split("::").last.to_s + # self.options = Onebox.options[self.class.name.split("::").last.to_s] #We can use this a more generic approach. extract the engine class name automatically + + self.options = DEFAULTS + + @selected_lines_array = nil + @selected_one_liner = 0 + @model_file = nil + + # Define constant after merging options set in Onebox.options + # We can define constant automatically. + options.each_pair do |constant_name, value| + constant_name_u = constant_name.to_s.upcase + if constant_name_u == constant_name.to_s + #define a constant if not already defined + unless self.class.const_defined? 
constant_name_u.to_sym + Onebox::Mixins::GitBlobOnebox.const_set constant_name_u.to_sym, + options[constant_name_u.to_sym] + end + end + end + end + + private + + def calc_range(m, contents_lines_size) + truncated = false + from = /\d+/.match(m[:from]) #get numeric should only match a positive interger + to = /\d+/.match(m[:to]) #get numeric should only match a positive interger + range_provided = !(from.nil? && to.nil?) #true if "from" or "to" provided in URL + from = from.nil? ? 1 : from[0].to_i #if from not provided default to 1st line + to = to.nil? ? -1 : to[0].to_i #if to not provided default to undefiend to be handled later in the logic + + if to === -1 && range_provided #case "from" exists but no valid "to". aka ONE_LINER + one_liner = true + to = from + else + one_liner = false + end + + unless range_provided #case no range provided default to 1..MAX_LINES + from = 1 + to = MAX_LINES + truncated = true if contents_lines_size > MAX_LINES + #we can technically return here + end + + from, to = [from, to].sort #enforce valid range. [from < to] + from = 1 if from > contents_lines_size #if "from" out of TOP bound set to 1st line + to = contents_lines_size if to > contents_lines_size #if "to" is out of TOP bound set to last line. 
+ + if one_liner + @selected_one_liner = from + if EXPAND_ONE_LINER != EXPAND_NONE + if (EXPAND_ONE_LINER & EXPAND_BEFORE != 0) # check if EXPAND_BEFORE flag is on + from = [1, from - LINES_BEFORE].max # make sure expand before does not go out of bound + end + + if (EXPAND_ONE_LINER & EXPAND_AFTER != 0) # check if EXPAND_FLAG flag is on + to = [to + LINES_AFTER, contents_lines_size].min # make sure expand after does not go out of bound + end + + from = contents_lines_size if from > contents_lines_size #if "from" is out of the content top bound + # to = contents_lines_size if to > contents_lines_size #if "to" is out of the content top bound + else + #no expand show the one liner solely + end + end + + if to - from > MAX_LINES && !one_liner #if exceed the MAX_LINES limit correct unless range was produced by one_liner which it expand setting will allow exceeding the line limit + truncated = true + to = from + MAX_LINES - 1 + end + + { + from: from, #calculated from + from_minus_one: from - 1, #used for getting currect ol>li numbering with css used in template + to: to, #calculated to + one_liner: one_liner, #boolean if a one-liner + selected_one_liner: @selected_one_liner, #if a one liner is provided we create a reference for it. + range_provided: range_provided, #boolean if range provided + truncated: truncated, + } + end + + #minimize/compact leading indentation while preserving overall indentation + def removeLeadingIndentation(str) + min_space = 100 + a_lines = str.lines + a_lines.each do |l| + l = l.chomp("\n") # remove new line + m = l.match(/\A[ ]*/) # find leading spaces 0 or more + if m.nil? 
|| l.size == m[0].size || l.size == 0 + next # SKIP: no match | only spaces in line | empty line + else # line has non-space content: measure its leading spaces + m_str_length = m[0].size + if m_str_length <= 1 # minimum space is 1 or nothing we can break we found our minimum + min_space = m_str_length + break #stop iteration + end + min_space = m_str_length if m_str_length < min_space + end + end + a_lines.each do |l| + re = Regexp.new "^[ ]{#{min_space}}" #match the minimum spaces of the line + l.gsub!(re, "") + end + a_lines.join + end + + def line_number_helper(lines, start, selected) + lines = removeLeadingIndentation(lines.join).lines # A little inefficient: we could modify removeLeadingIndentation to accept an array and return an array, but for now it only works with a string + hash_builder = [] + output_builder = [] + lines.map.with_index do |line, i| + lnum = (i.to_i + start) + hash_builder.push( + line_number: lnum, + data: line.gsub("\n", ""), + selected: (selected == lnum) ? true : false, + ) + output_builder.push "#{lnum}: #{line}" + end + { output: output_builder.join(), array: hash_builder } + end + + def raw + return @raw if defined?(@raw) + + m = @url.match(self.raw_regexp) + + if m + from = /\d+/.match(m[:from]) #get numeric part; should only match a positive integer + to = /\d+/.match(m[:to]) #get numeric part; should only match a positive integer + + @file = m[:file] + @lang = Onebox::FileTypeFinder.from_file_name(m[:file]) + + if @lang == "stl" && link.match?(%r{\Ahttps?://(www\.)?github\.com.*/blob/}) + @model_file = @lang.dup + @raw = "https://render.githubusercontent.com/view/solid?url=" + self.raw_template(m) + else + contents = URI.parse(self.raw_template(m)).open(read_timeout: timeout).read + + if contents.encoding == Encoding::BINARY || contents.bytes.include?(0) + @raw = nil + @binary = true + return + end + + contents_lines = contents.lines #get contents lines + contents_lines_size = contents_lines.size #get number of lines + + cr = calc_range(m, contents_lines_size) 
#calculate the range of lines for output + selected_one_liner = cr[:selected_one_liner] #if url is a one-liner calc_range will return it + from = cr[:from] + to = cr[:to] + @truncated = cr[:truncated] + range_provided = cr[:range_provided] + @cr_results = cr + + if range_provided #if a range provided (single line or more) + if SHOW_LINE_NUMBER + lines_result = + line_number_helper( + contents_lines[(from - 1)..(to - 1)], + from, + selected_one_liner, + ) #print code with prefix line numbers in case range provided + contents = lines_result[:output] + @selected_lines_array = lines_result[:array] + else + contents = contents_lines[(from - 1)..(to - 1)].join() + end + else + contents = contents_lines[(from - 1)..(to - 1)].join() + end + + if contents.length > MAX_CHARS #truncate content chars to limits + contents = contents[0...MAX_CHARS] #exclusive range: keep exactly MAX_CHARS characters (0..MAX_CHARS kept one too many) + @truncated = true + end + + @raw = contents + end + end + end + + def data + @data ||= { + title: title, + link: link, + i18n: i18n, + # IMPORTANT NOTE: All of the other instance variables are populated + # as *side effects* of the `raw` method! They must all appear + # AFTER the call to `raw`! Don't get bitten by this like I did! 
+ content: raw, + binary: @binary, + lang: "lang-#{@lang}", + lines: @selected_lines_array, + has_lines: !@selected_lines_array.nil?, + selected_one_liner: @selected_one_liner, + cr_results: @cr_results, + truncated: @truncated, + model_file: @model_file, + width: 480, + height: 360, + } + end + end + end + end +end diff --git a/mixins/github_body.rb b/mixins/github_body.rb new file mode 100644 index 00000000..b14de31a --- /dev/null +++ b/mixins/github_body.rb @@ -0,0 +1,31 @@ +# frozen_string_literal: true + +module Onebox + module Mixins + module GithubBody + def self.included(klass) + klass.include(Onebox::Engine) + klass.include(InstanceMethods) + end + + module InstanceMethods + GITHUB_COMMENT_REGEX = // + MAX_BODY_LENGTH = 80 + + def compute_body(body) + if body + body = body.gsub(GITHUB_COMMENT_REGEX, "").strip + if body.length == 0 + body = nil + elsif body.length > MAX_BODY_LENGTH + excerpt = body[MAX_BODY_LENGTH..body.length].rstrip + body = body[0..MAX_BODY_LENGTH - 1] + end + end + + [body, excerpt] + end + end + end + end +end diff --git a/mixins/twitch_onebox.rb b/mixins/twitch_onebox.rb new file mode 100644 index 00000000..8251bcb1 --- /dev/null +++ b/mixins/twitch_onebox.rb @@ -0,0 +1,34 @@ +# frozen_string_literal: true + +module Onebox + module Mixins + module TwitchOnebox + def self.included(klass) + klass.include(Onebox::Engine) + klass.matches_regexp(klass.twitch_regexp) + klass.requires_iframe_origins "https://player.twitch.tv" + klass.include(InstanceMethods) + end + + module InstanceMethods + def twitch_id + @url.match(self.class.twitch_regexp)[1] + end + + def base_url + "player.twitch.tv/?" 
+ end + + def placeholder_html + ::Onebox::Helpers.video_placeholder_html + end + + def to_html + <<~HTML + + HTML + end + end + end + end +end diff --git a/movie.rb b/movie.rb new file mode 100644 index 00000000..702e3c05 --- /dev/null +++ b/movie.rb @@ -0,0 +1,46 @@ +# frozen_string_literal: true + +module Onebox + class Movie + def initialize(json_ld_data) + @json_ld_data = json_ld_data + end + + def name + @json_ld_data["name"] + end + + def image + @json_ld_data["image"] + end + + def description + @json_ld_data["description"] + end + + def rating + @json_ld_data.dig("aggregateRating", "ratingValue") + end + + def genres + @json_ld_data["genre"] + end + + def duration + return nil unless @json_ld_data["duration"] + + Time.parse(@json_ld_data["duration"]).strftime "%H:%M" + end + + def to_h + { + name: name, + image: image, + description: description, + rating: rating, + genres: genres, + duration: duration, + } + end + end +end diff --git a/normalizer.rb b/normalizer.rb new file mode 100644 index 00000000..2a5e3c2e --- /dev/null +++ b/normalizer.rb @@ -0,0 +1,51 @@ +# frozen_string_literal: true + +module Onebox + class Normalizer + attr_reader :data + + def get(attr, *args) + value = data[attr] + return if value.blank? + return value.map { |v| sanitize_value(v, *args) } if value.is_a?(Array) + sanitize_value(value, *args) + end + + def method_missing(attr, *args, &block) + value = get(attr, *args) + + return nil if value.blank? 
+ + method_name = attr.to_s + if method_name.end_with?(*integer_suffixes) + value.to_i + elsif method_name.end_with?(*url_suffixes) + Onebox::Helpers.normalize_url_for_output(value).presence + else + value + end + end + + private + + def integer_suffixes + %w[width height] + end + + def url_suffixes + %w[url image video] + end + + def html_entities + @html_entities ||= HTMLEntities.new + end + + def sanitize_value(value, length = nil, sanitize = true) + value = html_entities.decode(value) + value = Sanitize.fragment(value) if sanitize + value.strip! + value = Onebox::Helpers.truncate(value, length) if length + value + end + end +end diff --git a/oembed.rb b/oembed.rb new file mode 100644 index 00000000..85d0dae5 --- /dev/null +++ b/oembed.rb @@ -0,0 +1,16 @@ +# frozen_string_literal: true + +module Onebox + class Oembed < OpenGraph + def initialize(response) + @data = ::MultiJson.load(response, symbolize_keys: true) + + # never use oembed from WordPress 4.4 (it's broken) + @data.delete(:html) if @data[:html] && @data[:html]["wp-embedded-content"] + end + + def html + get(:html, nil, false) + end + end +end diff --git a/open_graph.rb b/open_graph.rb new file mode 100644 index 00000000..d2507522 --- /dev/null +++ b/open_graph.rb @@ -0,0 +1,56 @@ +# frozen_string_literal: true + +module Onebox + class OpenGraph < Normalizer + def initialize(doc) + @data = extract(doc) + end + + def title + get(:title, 80) + end + + def title_attr + !title.nil? ? "title='#{title}'" : "" + end + + def secure_image_url + secure_url = URI(get(:image)) + secure_url.scheme = "https" + secure_url.to_s + end + + private + + COLLECTIONS = %i[article_section article_section_color article_tag] + + def extract(doc) + return {} if doc.blank? + + data = {} + + doc + .css("meta") + .each do |m| + if (m["property"] && m["property"][/\A(?:og|article|product):(.+)\z/i]) || + (m["name"] && m["name"][/\A(?:og|article|product):(.+)\z/i]) + value = (m["content"] || m["value"]).to_s + next if value.blank? 
+ key = $1.tr("-:", "_").to_sym + data[key] ||= value + if key.in?(COLLECTIONS) + collection_name = "#{key}s".to_sym + data[collection_name] ||= [] + data[collection_name] << value + end + end + end + + # Fall back to the document's title element when no title meta tag was found + title_element = doc.at_css("title") + data[:title] ||= title_element.text if title_element && title_element.text.present? + + data + end + end +end diff --git a/preview.rb b/preview.rb new file mode 100644 index 00000000..1e387aac --- /dev/null +++ b/preview.rb @@ -0,0 +1,111 @@ +# frozen_string_literal: true + +module Onebox + class Preview + # see https://bugs.ruby-lang.org/issues/14688 + client_exception = + defined?(Net::HTTPClientException) ? Net::HTTPClientException : Net::HTTPServerException + WEB_EXCEPTIONS ||= [ + client_exception, + OpenURI::HTTPError, + Timeout::Error, + Net::HTTPError, + Errno::ECONNREFUSED, + ] + + def initialize(url, options = Onebox.options) + @url = url + @options = options.dup + + allowed_origins = @options[:allowed_iframe_origins] || Onebox::Engine.all_iframe_origins + @options[:allowed_iframe_regexes] = Engine.origins_to_regexes(allowed_origins) + + @engine_class = Matcher.new(@url, @options).oneboxed + end + + def to_s + return "" unless engine + sanitize process_html engine_html + rescue *WEB_EXCEPTIONS + "" + end + + def placeholder_html + return "" unless engine + sanitize process_html engine.placeholder_html + rescue *WEB_EXCEPTIONS + "" + end + + def errors + return {} unless engine + engine.errors + end + + def data + return {} unless engine + engine.data + end + + def verified_data + return {} unless engine + engine.verified_data + end + + def options + OpenStruct.new(@options) + end + + private + + def engine_html + engine.to_html + end + + def process_html(html) + return "" unless html + + if @options[:max_width] + doc = Nokogiri::HTML5.fragment(html) + if doc + doc + .css("[width]") + .each do |e| + width = e["width"].to_i + + if width > @options[:max_width] + height = 
e["height"].to_i + if (height > 0) + ratio = (height.to_f / width.to_f) + e["height"] = (@options[:max_width] * ratio).floor + end + e["width"] = @options[:max_width] + end + end + return doc.to_html + end + end + + html + end + + def sanitize(html) + config = @options[:sanitize_config] || SanitizeConfig::ONEBOX + config = config.merge(allowed_iframe_regexes: @options[:allowed_iframe_regexes]) + + Sanitize.fragment(html, config) + end + + def engine + return nil unless @engine_class + return @engine if defined?(@engine) + + @engine = @engine_class.new(@url) + @engine.options = @options + @engine + end + + class InvalidURI < StandardError + end + end +end diff --git a/sanitize_config.rb b/sanitize_config.rb new file mode 100644 index 00000000..77f97ee9 --- /dev/null +++ b/sanitize_config.rb @@ -0,0 +1,111 @@ +# frozen_string_literal: true + +module Onebox + module SanitizeConfig + HTTP_PROTOCOLS ||= ["http", "https", :relative].freeze + + ONEBOX ||= + Sanitize::Config.freeze_config( + Sanitize::Config.merge( + Sanitize::Config::RELAXED, + elements: + Sanitize::Config::RELAXED[:elements] + + %w[audio details embed iframe source video svg path use], + attributes: { + "a" => Sanitize::Config::RELAXED[:attributes]["a"] + %w[target], + "audio" => %w[controls controlslist], + "embed" => %w[height src type width], + "iframe" => %w[ + allowfullscreen + frameborder + height + scrolling + src + width + data-original-href + data-unsanitized-src + ], + "source" => %w[src type], + "video" => %w[ + controls + height + loop + width + autoplay + muted + poster + controlslist + playsinline + ], + "path" => %w[d fill-rule], + "svg" => %w[aria-hidden width height viewbox], + "div" => [:data], # any data-* attributes, + "span" => [:data], # any data-* attributes, + "use" => %w[href], + }, + add_attributes: { + "iframe" => { + "seamless" => "seamless", + "sandbox" => + "allow-same-origin allow-scripts allow-forms allow-popups allow-popups-to-escape-sandbox" \ + " allow-presentation", + 
}, + }, + transformers: + (Sanitize::Config::RELAXED[:transformers] || []) + + [ + lambda do |env| + next unless env[:node_name] == "a" + a_tag = env[:node] + a_tag["href"] ||= "#" + if a_tag["href"] =~ %r{\A(?:[a-z]+:)?//} + a_tag["rel"] = "nofollow ugc noopener" + else + a_tag.remove_attribute("target") + end + end, + lambda do |env| + next unless env[:node_name] == "iframe" + + iframe = env[:node] + allowed_regexes = env[:config][:allowed_iframe_regexes] || [/.*/] + + allowed = allowed_regexes.any? { |r| iframe["src"] =~ r } + + if !allowed + # add a data attribute with the blocked src. This is not required + # but makes it much easier to troubleshoot onebox issues + iframe["data-unsanitized-src"] = iframe["src"] + iframe.remove_attribute("src") + end + end, + ], + protocols: { + "embed" => { + "src" => HTTP_PROTOCOLS, + }, + "iframe" => { + "src" => HTTP_PROTOCOLS, + }, + "source" => { + "src" => HTTP_PROTOCOLS, + }, + "use" => { + "href" => [:relative], + }, + }, + css: { + properties: Sanitize::Config::RELAXED[:css][:properties] + %w[--aspect-ratio], + }, + ), + ) + + DISCOURSE_ONEBOX ||= + Sanitize::Config.freeze_config( + Sanitize::Config.merge( + ONEBOX, + attributes: Sanitize::Config.merge(ONEBOX[:attributes], "aside" => [:data]), + ), + ) + end +end diff --git a/status_check.rb b/status_check.rb new file mode 100644 index 00000000..b774232f --- /dev/null +++ b/status_check.rb @@ -0,0 +1,44 @@ +# frozen_string_literal: true + +module Onebox + class StatusCheck + def initialize(url, options = Onebox.options) + @url = url + @options = options + @status = -1 + end + + def ok? 
+ status > 199 && status < 300 + end + + def status + check if @status == -1 + @status + end + + def human_status + case status + when 0 + :connection_error + when 200..299 + :success + when 400..499 + :client_error + when 500..599 + :server_error + else + :unknown_error + end + end + + private + + def check + status, headers = FinalDestination.new(@url).small_get({}) + @status = status + rescue Timeout::Error, Errno::ECONNREFUSED, Net::HTTPError, SocketError + @status = 0 + end + end +end diff --git a/template_support.rb b/template_support.rb new file mode 100644 index 00000000..4e09d411 --- /dev/null +++ b/template_support.rb @@ -0,0 +1,13 @@ +# frozen_string_literal: true + +module Onebox + module TemplateSupport + def load_paths + Onebox.options.load_paths.select(&method(:template?)) + end + + def template?(path) + File.exist?(File.join(path, "#{template_name}.#{template_extension}")) + end + end +end diff --git a/templates/_layout.mustache b/templates/_layout.mustache index 9075319f..02884822 100644 --- a/templates/_layout.mustache +++ b/templates/_layout.mustache @@ -1,4 +1,4 @@ -

+ {{/comment}} - + {{#discussion}} +

+ {{i18n.review_by}} + + {{user.login}} + {{user.login}} + - {{title}} +

+ {{/discussion}} + + {{#pr}} +

+ {{title}} +

+ {{/pr}} + +
+ {{base.label}}{{head.label}}
+ + {{#pr}} +
+
+ {{i18n.opened}} {{created_at}} +
+ + + + +
+ {{/pr}} diff --git a/templates/gitlabblob.mustache b/templates/gitlabblob.mustache index a24ffdfb..9e3390b4 100644 --- a/templates/gitlabblob.mustache +++ b/templates/gitlabblob.mustache @@ -17,5 +17,5 @@ {{/has_lines}} {{#truncated}} - This file has been truncated. show original + {{i18n.truncated_file}} {{i18n.show_original}} {{/truncated}} diff --git a/templates/hackernews.mustache b/templates/hackernews.mustache new file mode 100644 index 00000000..e4431a26 --- /dev/null +++ b/templates/hackernews.mustache @@ -0,0 +1,18 @@ +

{{title}}

+ +{{#description}} +

{{description}}

+{{/description}} + + +

+ {{#data_1}} + {{data_1}} points — + {{/data_1}} + {{#data_2}} + {{data_2}} comments — + {{/data_2}} + {{author}} — + {{timestamp}} +

+ diff --git a/templates/json_ld_partials/movie.mustache b/templates/json_ld_partials/movie.mustache new file mode 100644 index 00000000..740670a4 --- /dev/null +++ b/templates/json_ld_partials/movie.mustache @@ -0,0 +1,6 @@ +{{#rating}} +

Average Rating: {{rating}}

+{{/rating}} +{{#duration}} +

Duration: {{duration}}

+{{/duration}} diff --git a/templates/preview_error_fragment_onebox.mustache b/templates/preview_error_fragment_onebox.mustache new file mode 100644 index 00000000..9a36e118 --- /dev/null +++ b/templates/preview_error_fragment_onebox.mustache @@ -0,0 +1,4 @@ +
+ {{{error_message}}} +
+
diff --git a/templates/preview_error_onebox.mustache b/templates/preview_error_onebox.mustache new file mode 100644 index 00000000..c51f8a34 --- /dev/null +++ b/templates/preview_error_onebox.mustache @@ -0,0 +1,10 @@ + diff --git a/templates/threadsstatus.mustache b/templates/threadsstatus.mustache new file mode 100644 index 00000000..b39da295 --- /dev/null +++ b/templates/threadsstatus.mustache @@ -0,0 +1,30 @@ +{{#avatar}}{{/avatar}} +

{{title}}

+ + +
+ {{{description}}} + {{#image}} +
+ {{/image}} +
+ +
+ {{#likes}} + + {{/likes}} + + {{#replies}} + + + {{replies}} + + {{/replies}} +
diff --git a/templates/twitterstatus.mustache b/templates/twitterstatus.mustache index 67f1b60c..3c3d8497 100644 --- a/templates/twitterstatus.mustache +++ b/templates/twitterstatus.mustache @@ -1,22 +1,27 @@ {{#avatar}}{{/avatar}} -

{{title}}

+
- {{{tweet}}} - {{#quoted_tweet}} + {{#is_reply}} + + + + {{/is_reply}} + {{{tweet}}} + {{#quoted_text}}

{{quoted_full_name}} @{{quoted_screen_name}}

-
{{quoted_tweet}}
+
{{quoted_text}}
- {{/quoted_tweet}} + {{/quoted_text}}
- {{timestamp}} + {{timestamp}} {{#likes}}