#!/usr/bin/env ruby
# frozen_string_literal: true

# Walks a chain of linked HTML pages (e.g. a web comic), starting from the
# "first" page reachable from the configured base URL, and stores the image
# found on every page in a zip archive named after the configuration.
#
# Usage: <script> CONFIG.yml
#
# Configuration keys read by this script:
#   name        - archive base name and directory prefix inside the archive
#   base_url    - URL of any page of the series
#   first_xpath, last_xpath, next_xpath, prev_xpath - XPath of navigation links
#   image_xpath - XPath of the image URL on a page
#   sleep       - pause between two pages, in seconds (no pause when absent)

require 'pp'
require 'yaml'
require 'uri'
require 'rubygems'
require 'bundler/setup'
require 'nokogiri'
require 'open-uri'
require 'zipruby'

# One fetched HTML page together with lazily resolved links to its neighbours.
class Page
  attr_reader :url
  attr_writer :prev, :next
  attr_accessor :index

  # Opens +url+ through open-uri and returns what open-uri returns (the block's
  # value when a block is given, in which case the handle is closed afterwards).
  #
  # Prefers URI.open where open-uri provides it and falls back to Kernel.open
  # on older Rubies where open-uri only patches Kernel#open.
  # NOTE(review): the Kernel.open fallback would treat a string starting with
  # "|" as a command; only URLs produced by #_make_url or taken from the
  # trusted configuration should be passed here.
  #
  # @param url [String] absolute URL to fetch
  # @return [Object] the opened IO-like object, or the block's result
  def self.open_url(url, &block)
    if URI.respond_to?(:open)
      URI.open(url, &block)
    else
      Kernel.open(url, &block)
    end
  end

  # Downloads and parses the page at +url+.
  #
  # @param url [String] absolute URL of the page
  # @param config [Hash] configuration holding the *_xpath entries
  def initialize(url, config)
    puts "page %s" % url
    @url = url
    @config = config
    @prev = nil
    @next = nil
    @first = nil
    @last = nil
    @image = nil
    @index = 0
    # Block form so the HTTP handle is closed once the document is parsed.
    @doc = Page.open_url(url) { |io| Nokogiri::HTML(io) }
  end

  # @return [Page] the page the 'first_xpath' link points to (memoized)
  def first
    @first ||= follow('first')
  end

  # @return [Page] the page the 'last_xpath' link points to (memoized)
  def last
    @last ||= follow('last')
  end

  # @return [Page] the page the 'next_xpath' link points to (memoized);
  #   its +prev+ is set to this page so walking back does not refetch
  def next
    @next ||= follow('next').tap { |page| page.prev = self }
  end

  # @return [Page] the page the 'prev_xpath' link points to (memoized);
  #   its +next+ is set to this page so walking forward does not refetch
  def prev
    @prev ||= follow('prev').tap { |page| page.next = self }
  end

  # @return [String] absolute URL of the image selected by 'image_xpath'
  #   (memoized)
  def image
    @image ||= _make_url(@url, @doc.xpath(@config['image_xpath']).text)
  end

  # Resolves the link +next_str+ found on the page at +current_str+ into an
  # absolute URL. Absolute, host-relative ("/x"), protocol-relative ("//h/x"),
  # path-relative ("x.html") and query-only ("?p=2") links are all handled by
  # URI.join. The fragment is removed so that a self-link such as "#" resolves
  # to exactly the current URL, which the main loop uses to detect the end.
  #
  # @param current_str [String] absolute URL of the current page
  # @param next_str [String] link text as found in the document
  # @return [String] absolute URL
  # @raise [URI::Error] when either argument cannot be parsed or joined
  def _make_url(current_str, next_str)
    resolved = URI.join(current_str, next_str.strip)
    resolved.fragment = nil
    resolved.to_s
  end

  private

  # Looks up the "<kind>_xpath" link in this page, prints it for debugging and
  # returns a freshly fetched Page for the resolved URL.
  def follow(kind)
    link = @doc.xpath(@config["#{kind}_xpath"])
    pp link.inspect
    puts "#{kind} #{link.text}"
    Page.new(_make_url(@url, link.text), @config)
  end
end

config_path = ARGV.shift
abort "usage: #{$PROGRAM_NAME} CONFIG.yml" if config_path.nil?

# load_file opens and closes the file itself, so no handle is leaked.
config_yml = YAML.load_file(config_path)
pp config_yml

name = config_yml.fetch('name')
page = Page.new(config_yml.fetch('base_url'), config_yml).first

# Block form closes (and thereby writes) the archive even when a download fails.
Zip::Archive.open(name + '.zip', Zip::CREATE | Zip::TRUNC) do |ar|
  loop do
    puts "PAGE %s" % page.url
    image_url = page.image
    puts " image = %s" % image_url.inspect

    # Extension is taken from the URL path only, so a query string is ignored.
    image_format = File.extname(URI.parse(image_url).path).sub(/\A\./, '')
    image_path = File.join(name, format('page_%04d.%s', page.index, image_format))
    ar.add_buffer(image_path, Page.open_url(image_url, &:read))

    page_next = page.next
    # The last page links to itself (or has no usable link): stop there.
    break if page_next.url == page.url

    page_next.index = page.index + 1
    page = page_next
    sleep(config_yml['sleep'] || 0)
  end
end