Add a mechanize-based sitemap crawler, a device profile catalogue and
per-run output paths for the screenshot/crop actors.

diff --git a/lib/webgalien.rb b/lib/webgalien.rb
index 6d4e705..b6d618b 100644
--- a/lib/webgalien.rb
+++ b/lib/webgalien.rb
@@ -2,6 +2,7 @@
 # Load external dependencies
 require 'thor'
 require 'celluloid/current'
+require 'mechanize'
 require 'selenium-webdriver'
 require 'yaml'
 require 'thor'
@@ -15,6 +16,7 @@ require 'webgalien/actors/work_actor'
 
 require 'webgalien/screenshot'
 require 'webgalien/sitemap'
+require 'webgalien/devices'
 
 # Load cli
 require 'webgalien/cli'
diff --git a/lib/webgalien/actors/crop_png_actor.rb b/lib/webgalien/actors/crop_png_actor.rb
index d8ada18..f7a0480 100644
--- a/lib/webgalien/actors/crop_png_actor.rb
+++ b/lib/webgalien/actors/crop_png_actor.rb
@@ -4,13 +4,18 @@ module Webgalien
   class CropPngActor
     include Celluloid
 
+    # @param output_path [String] directory receiving the cropped PNG files
+    def initialize(output_path)
+      @output_path = output_path
+    end
+
     def perform(work_future)
       work = work_future.value
       work.shift!
 
       input_path = work.input[:path]
       bbox = work.input[:bbox]
-      output_path = File.join(TMP_PREFIX, 'crop-' + work.id + '.png')
+      output_path = File.join(@output_path, "crop-#{work.id}.png")
 
       puts "(#{work.id}) cropping capture".green
       system 'convert ' \
diff --git a/lib/webgalien/actors/screenshot_actor.rb b/lib/webgalien/actors/screenshot_actor.rb
index 3462b92..9a6662d 100644
--- a/lib/webgalien/actors/screenshot_actor.rb
+++ b/lib/webgalien/actors/screenshot_actor.rb
@@ -8,6 +8,11 @@ module Webgalien
   class ScreenshotActor
     include Celluloid
 
+    # @param output_path [String] directory where captures are written
+    def initialize(output_path)
+      @output_path = output_path
+    end
+
     def perform(future_work)
       driver = initialize_selenium_driver
 
@@ -15,16 +20,24 @@ module Webgalien
       work.shift!
 
       url = work.input[:url]
-      # Start selenium work
-      # platform_login driver
+
       # Go to wanted page
-      driver.manage.window.resize_to(1440, 8000)
+      driver.manage.window.resize_to(1440, 900)
       puts "(#{work.id}) loading page #{url}"
       driver.navigate.to url
 
       puts "(#{work.id}) waiting DOM stability"
       wait_dom_stability(driver)
 
+      # get page size
+      element = driver.find_element(:css, "body")
+      height = element.size.height.to_i + 1
+      puts "(#{work.id}) resizing to 1440x#{height}"
+      driver.manage.window.resize_to(1440, height)
+
+      puts "(#{work.id}) waiting DOM stability"
+      wait_dom_stability(driver)
+
       puts "(#{work.id}) getting page dimensions"
       element = driver.find_element(:css, "body")
       location = element.location
@@ -37,9 +50,9 @@ module Webgalien
         y: 0 # location.y.to_i
       }
 
-      puts "(#{work.id}) saving page"
-      FileUtils.mkdir_p(TMP_PREFIX)
-      tmp_path = File.join(TMP_PREFIX, 'capture-' + work.id.to_s + '.png')
+      tmp_path = File.join(@output_path, 'capture-' + work.id.to_s + '.png')
+      puts "(#{work.id}) saving page to #{tmp_path}"
+      FileUtils.mkdir_p(@output_path)
       driver.save_screenshot tmp_path
       driver.quit
 
@@ -54,12 +67,22 @@ module Webgalien
     private
 
     def initialize_selenium_driver
+      client = Selenium::WebDriver::Remote::Http::Default.new
+      client.read_timeout = 120
+      client.open_timeout = 120
+
       options = Selenium::WebDriver::Chrome::Options.new
       options.add_argument('--headless')
       options.add_argument('--disable-gpu')
+      options.add_argument('--dns-prefetch-disable')
       options.add_argument("--user-agent=#{USER_AGENT}")
 
-      Selenium::WebDriver.for :chrome, options: options
+      # driver.timeout = 90 # instead of the default 60
+      Selenium::WebDriver.for(
+        :chrome,
+        options: options,
+        http_client: client
+      )
     end
 
     # wait for DOM structure to be stabilizer for 5 consecutive tries
diff --git a/lib/webgalien/cli.rb b/lib/webgalien/cli.rb
index e72bc09..49d6a1c 100644
--- a/lib/webgalien/cli.rb
+++ b/lib/webgalien/cli.rb
@@ -1,4 +1,6 @@
 
+require 'table_print'
+
 module Webgalien
   class Cli < Thor
     class_option :'user-agent',
@@ -6,32 +8,59 @@ module Webgalien
       banner: 'USER-AGENT',
       type: :string,
       desc: 'choose user agent (default Mozilla)'
 
-    desc 'sitemap URL FILE', 'crawl site and export sitemap'
-    def sitemap url, file
+    option :output,
+      aliases: '-o',
+      banner: 'OUTPUT-FILE',
+      type: :string,
+      default: 'sitemap.yml',
+      desc: 'where sitemap will be produced (default: sitemap.yml)'
+    desc 'sitemap URL', 'crawl site and export sitemap'
+    def sitemap(url)
+      Sitemap.start(
+        url: url,
+        output: options['output']
+      )
     end
 
     desc 'screenshot FILE', 'take screenshots for each page'
+    option :device,
+      aliases: '-d',
+      banner: 'DEVICE',
+      type: :string,
+      default: 'desktop',
+      desc: 'set device from "list-devices" (default "desktop")'
     option :profile,
-      aliases: '-p',
-      banner: 'PROFILE',
+      aliases: '-r',
+      banner: '[portrait|landscape]',
       type: :string,
-      desc: 'choose device profile / resolution (default 1440x900 on desktop pc)'
-    option :output,
+      desc: 'choose device orientation (default "portrait")'
+    option :"output-path",
       aliases: '-o',
-      banner: 'OUTPUT-DIRECTORY',
+      banner: 'OUTPUT-PATH',
       type: :string,
-      default: '.',
-      desc: 'where resulting content will be produced'
-
-    def screenshot file
-      config = YAML.load File.open(file)
-      prefixed_urls = config['pages'].map {|u| config['root'] + u }
+      default: 'cache',
+      desc: 'directory where resulting content will be produced'
+    def screenshot(sitemap)
+      unless Devices.exist?(options['device'])
+        STDERR.puts "ERROR: device #{options['device']} does not exist"
+        exit 1
+      end
 
       Screenshot.start(
-        urls: prefixed_urls,
-        output_path: options['output']
+        sitemap: sitemap,
+        output_path: options['output-path'],
+        device: options['device'],
+        # the orientation flag is declared above as `option :profile`
+        orientation: options['profile']
       )
     end
+
+
+    desc 'list-devices', 'list available profiles'
+    def list_devices
+      # from https://mediag.com/news/popular-screen-resolutions-designing-for-all/
+      Devices.display_list
+    end
   end
 end
diff --git a/lib/webgalien/devices.rb b/lib/webgalien/devices.rb
new file mode 100644
index 0000000..e488ba9
--- /dev/null
+++ b/lib/webgalien/devices.rb
@@ -0,0 +1,59 @@
+require 'csv'
+
+module Webgalien
+  # Catalogue of known device profiles (model name, viewport size, device
+  # type and optional user agent), parsed once from the embedded CSV table.
+  class Devices
+    # Columns: model, width, height, type, user agent (optional).
+    PROFILES_CSV = <<-MARK.freeze
+    laptop, 1440, 900, computer, "x"
+    desktop, 1280, 768, computer, "x"
+
+    apple-iphone-x, 375, 812, phone, "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"
+    apple-iphone-8plus, 414, 736, phone, "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"
+    apple-iphone-8, 375, 667, phone, "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"
+    apple-iphone-7plus, 414, 736, phone
+    apple-iphone-7, 375, 667, phone
+    apple-iphone-6plus, 414, 736, phone
+    apple-iphone-6, 375, 667, phone
+    apple-iphone-5, 320, 568, phone
+    apple-ipad-pro, 1024, 1366, tablet
+    apple-ipad, 768, 1024, tablet
+    apple-air, 768, 1024, tablet
+    apple-air-2, 768, 1024, tablet
+    apple-mini, 768, 1024, tablet
+    apple-mini-2, 768, 1024, tablet
+    apple-mini-3, 768, 1024, tablet
+
+    samsung-galaxy-s9, 360, 740, phone
+    samsung-galaxy-s8plus, 360, 740, phone
+    samsung-galaxy-s8, 360, 740, phone
+    samsung-galaxy-s7, 360, 640, phone
+    nexus-6p, 411, 731, phone
+    MARK
+
+    # Array of hashes with :model, :width, :height, :type and :agent keys;
+    # every value is a stripped String, or nil when the column is absent.
+    PROFILES = CSV.parse(PROFILES_CSV, skip_blanks: true, quote_char: '"', liberal_parsing: true)
+      .map do |model, width, height, type, agent|
+        {
+          model: model&.strip,
+          width: width&.strip,
+          height: height&.strip,
+          type: type&.strip,
+          agent: agent&.strip
+        }
+      end.freeze
+
+    # Print the profile table via `tp`.
+    def self.display_list
+      tp PROFILES
+    end
+
+    # @param profile [String, nil] model name, e.g. "desktop"
+    # @return [Boolean] true when a profile with that model name is listed
+    def self.exist?(profile)
+      PROFILES.any? { |device| device[:model] == profile }
+    end
+  end
+end
diff --git a/lib/webgalien/screenshot.rb b/lib/webgalien/screenshot.rb
index d58b6c4..914938b 100644
--- a/lib/webgalien/screenshot.rb
+++ b/lib/webgalien/screenshot.rb
@@ -1,16 +1,21 @@
 
 module Webgalien
   class Screenshot
-    def self.start(urls:, output_path:)
+    # sitemap is the path of a YAML file whose 'pages' entry lists the URLs
+    # to capture; output_path is handed to both actor pools.
+    def self.start(sitemap:, output_path:, device:, orientation:)
+      config = YAML.load_file(sitemap)
+      urls = config['pages']
+
       # Start workpools
       cores = Celluloid.cores
       screenshot_pool = ScreenshotActor.pool(
         size: cores,
-        args: { output_path: output_path }
+        args: [ output_path ]
       )
       crop_pool = CropPngActor.pool(
         size: cores,
-        args: { output_path: output_path }
+        args: [ output_path ]
       )
 
       futures =
diff --git a/lib/webgalien/sitemap.rb b/lib/webgalien/sitemap.rb
index 4dd5a8b..014c59b 100644
--- a/lib/webgalien/sitemap.rb
+++ b/lib/webgalien/sitemap.rb
@@ -1,7 +1,67 @@
 
 module Webgalien
   class Sitemap
-    def self.start(output_path:)
+    # Crawl the site from url, following links that stay under it, then
+    # write the root and the visited pages to the YAML file named by output.
+    def self.start(url:, output:)
+      puts "Loading #{url}"
+
+      visited = Set.new
+      requested = Set.new
+      remains = Set.new
+      remains << url
+      root = url
+
+      until remains.empty?
+        current = remains.first
+        remains.delete(current)
+        # remember what was asked for: when the fetched page reports another
+        # URI (e.g. after a redirect) the link would otherwise be re-queued
+        requested << current
+
+        current2, links = Sitemap.get_links(root, current)
+        visited << current2
+
+        remains =
+          remains + links.to_set - visited - requested - visited.map{|x| x.gsub(/\/$/,'') }
+      end
+
+      result = {
+        "root" => root,
+        "pages" => visited.to_a
+      }
+      File.write(output, result.to_yaml)
+    end
+
+
+    # Fetch url and collect the links found on the page that start with
+    # root; returns page.uri as a string together with those links.
+    def self.get_links(root, url)
+      links = []
+      mechanize = Mechanize.new
+      page = mechanize.get(url)
+      url2 = page.uri.to_s
+
+      page.links.each do |link|
+        # `! link.href =~ /re/` parsed as `(!link.href) =~ /re/` and never
+        # skipped anything; relative hrefs are resolved below, so only
+        # links without an href are skipped here
+        next unless link.href
+        begin
+          link_url = mechanize.resolve(link.href).to_s
+          print "Found #{url} -> #{link_url} "
+          if link_url.start_with?(root)
+            puts "(ok)".green
+            links << link_url
+          else
+            puts "(out of scope)".red
+          end
+        rescue Mechanize::UnsupportedSchemeError
+          print "Found #{url} -> #{link.href} "
+          puts "(unsupported scheme)".red
+        end
+      end
+      return url2, links
     end
   end
 end