56 lines
1.3 KiB
Ruby
56 lines
1.3 KiB
Ruby
|
|
module Webgalien
|
|
class Sitemap
|
|
def self.start(url:, output:)
|
|
puts "Loading #{url}"
|
|
|
|
visited = Set.new
|
|
remains = Set.new
|
|
remains << url
|
|
root = url
|
|
|
|
while remains.size > 0
|
|
current = remains.to_a[0]
|
|
remains.delete(current)
|
|
|
|
current2, links = Sitemap.get_links(root, current)
|
|
visited << current2
|
|
|
|
remains =
|
|
remains + links.to_set - visited - visited.map{|x| x.gsub(/\/$/,'') }
|
|
end
|
|
|
|
result = {
|
|
"root" => root,
|
|
"pages" => visited.to_a
|
|
}
|
|
File.write(output, result.to_yaml)
|
|
end
|
|
|
|
|
|
def self.get_links(root, url)
|
|
links = []
|
|
mechanize = Mechanize.new
|
|
page = mechanize.get(url)
|
|
url2 = page.uri.to_s
|
|
|
|
page.links.each do |link|
|
|
next if ! link.href =~ /^https?:\/\//
|
|
begin
|
|
link_url = mechanize.resolve(link.href).to_s
|
|
print "Found #{url} -> #{link_url} "
|
|
if link_url.start_with?(root) then
|
|
puts "(ok)".green
|
|
links << link_url
|
|
else
|
|
puts "(out of scope)".red
|
|
end
|
|
rescue Mechanize::UnsupportedSchemeError
|
|
print "Found #{url} -> #{link.href} "
|
|
puts "(unsupported scheme)".red
|
|
end
|
|
end
|
|
return url2, links
|
|
end
|
|
end
|
|
end
|