comicbox/bin/web2cbz

130 lines
2.8 KiB
Text
Raw Normal View History

2011-11-22 17:42:02 +00:00
#!/usr/bin/env ruby
require 'pp'
require 'yaml'
require 'rubygems'
require 'bundler/setup'
require 'nokogiri'
require 'open-uri'
2011-11-23 14:23:52 +00:00
require 'zipruby'
2011-11-22 17:42:02 +00:00
# Load the scraper configuration from the YAML file named by the first
# command-line argument. The parsed result is kept in config_yml and echoed
# to stdout.
config_path = ARGV.shift
# Without a path File.open would fail with an opaque TypeError; say what is expected instead.
abort "usage: #{File.basename($0)} CONFIG.yml" if config_path.nil?
# Block form closes the file handle as soon as parsing is done.
config_yml = File.open(config_path) { |config_fh| YAML.load(config_fh) }
pp config_yml
# One page of a web comic. The page is fetched and parsed on construction.
#
# Navigation links and the image address are located with the XPath
# expressions stored in the config hash under the keys 'first_xpath',
# 'last_xpath', 'next_xpath', 'prev_xpath' and 'image_xpath'. Linked pages are
# fetched on first access and cached afterwards.
class Page
  attr_reader :url
  # The readers for prev/next are the lazy methods defined below, so only the
  # writers are generated here.
  attr_writer :prev, :next
  attr_accessor :index

  # url    - String address of the page; it is downloaded immediately.
  # config - Hash holding the *_xpath expressions described above.
  def initialize(url, config)
    puts "page %s" % url
    @url = url
    @config = config
    @prev = nil
    @next = nil
    @first = nil
    @last = nil
    @image = nil
    @index = 0
    @doc = _load_document(url)
  end

  # Returns the Page the 'first_xpath' link points at (fetched once, then cached).
  def first
    @first ||= _linked_page('first')
  end

  # Returns the Page the 'last_xpath' link points at (fetched once, then cached).
  def last
    @last ||= _linked_page('last')
  end

  # Returns the following Page and registers this page as its predecessor.
  def next
    return @next unless @next.nil?

    @next = _linked_page('next')
    @next.prev = self
    @next
  end

  # Returns the preceding Page and registers this page as its successor.
  def prev
    return @prev unless @prev.nil?

    @prev = _linked_page('prev')
    @prev.next = self
    @prev
  end

  # Returns the absolute URL (String) of the image selected by 'image_xpath'.
  # The value is computed once and cached.
  def image
    @image ||= _make_url(@url, @doc.xpath(@config['image_xpath']).text)
  end

  # Resolves a link found on a page against that page's own address.
  #
  # current_str - String URL of the page containing the link.
  # next_str    - String link target: absolute, root-relative, relative,
  #               query-only, or empty.
  #
  # Returns the resolved URL as a String. An empty or fragment-only link
  # resolves to the current page's URL.
  # Raises URI::InvalidURIError when either string is not a valid URI.
  def _make_url(current_str, next_str)
    # URI.join resolves relative links against the current page's directory.
    resolved = URI.join(current_str, next_str.to_s.strip)
    # Fragments do not affect what is fetched; dropping them keeps a "#" link
    # equal to the current page's URL.
    resolved.fragment = nil
    resolved.to_s
  end

  private

  # Builds the Page referenced by the "<kind>_xpath" link of this document.
  def _linked_page(kind)
    link = @doc.xpath(@config["#{kind}_xpath"])
    pp link.inspect
    puts "#{kind} %s" % link.text
    Page.new(_make_url(@url, link.text), @config)
  end

  # Downloads url and parses it as HTML; the response handle is closed when
  # the block returns.
  def _load_document(url)
    # URI.open is the supported entry point on newer Rubies, where Kernel#open
    # no longer opens URLs; older Rubies only offer Kernel#open via open-uri.
    if URI.respond_to?(:open)
      URI.open(url) { |io| Nokogiri::HTML(io) }
    else
      open(url) { |io| Nokogiri::HTML(io) }
    end
  end
end
2011-11-23 14:23:52 +00:00
#FileUtils.mkdir_p config_yml['name']
2011-11-22 17:42:02 +00:00
# Downloads url and returns the response body, closing the handle afterwards.
def read_url(url)
  # URI.open is the supported entry point on newer Rubies, where Kernel#open
  # no longer opens URLs; older Rubies only offer Kernel#open via open-uri.
  if URI.respond_to?(:open)
    URI.open(url) { |io| io.read }
  else
    open(url) { |io| io.read }
  end
end

# Walk the comic starting at its first page and store every page's image in
# "<name>.zip" under "<name>/page_NNNN<ext>".
page = Page.new config_yml['base_url'], config_yml
page = page.first
delay = config_yml['sleep']
ar = Zip::Archive.open(config_yml['name'] + '.zip', Zip::CREATE | Zip::TRUNC)
begin
  loop do
    image_url = page.image
    puts "PAGE %s" % page.url
    puts " image = %s" % image_url.inspect
    # Take the extension from the URL path only, so a query string never ends
    # up in the archive entry name; it is empty when the path has none.
    image_ext = File.extname(URI.parse(image_url).path)
    image_path = File.join(config_yml['name'], "page_%04d%s" % [page.index, image_ext])
    ar.add_buffer(image_path, read_url(image_url))
    page_next = page.next
    # A "next" link that resolves to the current page marks the end of the comic.
    break if page_next.url == page.url

    page_next.index = page.index + 1
    page = page_next
    # Pause between requests only when the config asks for it.
    sleep delay if delay
  end
ensure
  # Finalise the archive even when a download or parse step raises.
  ar.close
end