webgalen/lib/webgalien/sitemap.rb

56 lines
1.3 KiB
Ruby

module Webgalien
  # Crawls a website with Mechanize and records every in-scope page it
  # reaches in a YAML file.
  class Sitemap
    # Crawls all pages reachable from +url+ and writes the result to +output+.
    #
    # A link is followed only when its resolved URL starts with +url+. The
    # YAML document written has two keys: "root" (the start URL) and "pages"
    # (the final, post-redirect URLs of every page fetched).
    #
    # @param url [String] start URL; also the prefix that defines the crawl scope
    # @param output [String] path of the YAML file to write
    # @return [Integer] the value returned by File.write
    def self.start(url:, output:)
      puts "Loading #{url}"
      root = url
      visited = Set.new   # final (post-redirect) URLs of fetched pages
      requested = Set.new # URLs exactly as they were requested
      remains = Set.new([url])

      until remains.empty?
        current = remains.first
        remains.delete(current)
        requested << current

        final_url, links = Sitemap.get_links(root, current)
        visited << final_url

        # Also drop the slash-less spelling of every visited URL so that
        # "…/foo" is not fetched again after "…/foo/" has been recorded.
        without_slash = visited.map { |page_url| page_url.chomp('/') }
        # Subtracting `requested` guarantees termination: a URL that redirects
        # elsewhere never enters `visited` under its own name, so without this
        # it would be re-queued every time a page links to it.
        remains = remains + links.to_set - visited - without_slash - requested
      end

      result = {
        "root" => root,
        "pages" => visited.to_a
      }
      File.write(output, result.to_yaml)
    end

    # Fetches +url+ and collects the links on it that stay inside +root+.
    #
    # Every link found is reported on stdout together with its verdict
    # ("(ok)", "(out of scope)" or "(unsupported scheme)").
    #
    # @param root [String] prefix a resolved link must start with to be kept
    # @param url [String] URL of the page to fetch
    # @return [Array(String, Array<String>)] the URL of the fetched page as
    #   reported by Mechanize (page.uri), and the in-scope link URLs
    def self.get_links(root, url)
      links = []
      mechanize = Mechanize.new
      page = mechanize.get(url)
      final_url = page.uri.to_s

      # Mechanize hands back non-page objects for non-HTML responses; those
      # offer no #links, so there is nothing to collect from them.
      return [final_url, links] unless page.respond_to?(:links)

      # No href pre-filter here on purpose: relative hrefs have to reach
      # #resolve to be followed, and non-HTTP schemes are reported through
      # the rescue below.
      page.links.each do |link|
        begin
          link_url = mechanize.resolve(link.href).to_s
          print "Found #{url} -> #{link_url} "
          if link_url.start_with?(root)
            puts "(ok)".green
            links << link_url
          else
            puts "(out of scope)".red
          end
        rescue Mechanize::UnsupportedSchemeError
          print "Found #{url} -> #{link.href} "
          puts "(unsupported scheme)".red
        end
      end

      [final_url, links]
    end
  end
end