diff --git a/Gemfile.lock b/Gemfile.lock index ad2526e..3455575 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -4,6 +4,7 @@ PATH epafh (0.1.0) colorize hash_validator + highline json mail (~> 2.6.3) mechanize @@ -19,6 +20,7 @@ GEM domain_name (0.5.25) unf (>= 0.0.5, < 1.0.0) hash_validator (0.4.0) + highline (1.7.8) http-cookie (1.0.2) domain_name (~> 0.5) interception (0.5) diff --git a/bin/epafh b/bin/epafh index ca30a32..d7dcdb8 100755 --- a/bin/epafh +++ b/bin/epafh @@ -15,352 +15,10 @@ require 'thor' require 'json' require 'mail' require 'colorize' +require 'epafh' #Net::IMAP.debug = true -class Hash - #take keys of hash and transform those to a symbols - def self.transform_keys_to_symbols(value) - return value if not value.is_a?(Hash) - hash = value.inject({}) do |memo,(k,v)| - memo[k.to_sym] = Hash.transform_keys_to_symbols(v); memo - end - return hash - end -end -module Epafh - EPAFI_CONFIG_FILE = File.join(ENV['HOME'],'.epafh','config.yml') - EPAFI_IGNORE_FILE = File.join(ENV['HOME'],'.epafh','ignore.yml') - - class ContactManager - - CRM_LOGIN_URL = '/login' - CRM_LEADS_URL = '/leads.json' - CRM_CONTACTS_URL = '/contacts.json' - - - def initialize config - @config = config - - @browser = Mechanize.new { |agent| - agent.user_agent_alias = 'Mac Safari' - } - @ignore_list = Set.new - @keep_list = Set.new - - ## Load configuration file - # - - unless File.exist? EPAFI_CONFIG_FILE then - raise "Unable to find configuration file #{EPAFI_CONFIG_FILE}" - end - @config = config - - - connect! - load_contacts - load_leads - load_ignore - #puts @keep_list.to_a - rescue RuntimeError => e - STDERR.puts e.message - end - - def connect! - @browser.get(@config[:crm][:baseurl] + CRM_LOGIN_URL) do |page| - page.form_with(action: '/authentication') do |f| - f['authentication[username]'] = @config[:crm][:login] - f['authentication[password]'] = @config[:crm][:password] - end.click_button - end - - rescue Mechanize::ResponseCodeError - raise "Authentication error. Verify your credentials." - end - - def load_ignore - if File.exist? EPAFI_IGNORE_FILE - ignore_list = YAML.load_file(EPAFI_IGNORE_FILE) - ignore_list.each do |email| - @ignore_list << email.strip.downcase - end - end - end - - def load_leads page=1 - crm_leads_page = @browser.get(@config[:crm][:baseurl] + CRM_LEADS_URL + "?page=#{page}") - crm_leads = JSON.parse crm_leads_page.body - crm_leads.each do |lead_obj| - keep_contact lead_obj['lead']['email'].split(',') - keep_contact lead_obj['lead']['alt_email'].split(',') - end - - if crm_leads.size > 0 then - load_leads (page + 1) - end - end - - def load_contacts page=1 - crm_contacts_page = @browser.get(@config[:crm][:baseurl] + CRM_CONTACTS_URL + "?page=#{page}") - crm_contacts = JSON.parse crm_contacts_page.body - crm_contacts.each do |contact_obj| - keep_contact contact_obj['contact']['email'].split(',') - keep_contact contact_obj['contact']['alt_email'].split(',') - end - - if crm_contacts.size > 0 then - load_contacts (page + 1) - end - #contacts.to_a.sort.join(', ') - end - - def keep_contact emails - emails = emails.to_a if emails.is_a? Set - [emails].flatten.each do |mail| - @keep_list << mail.strip.downcase - end - end - - def ignore_contact emails - emails = emails.to_a if emails.is_a? Set - [emails].flatten.each do |mail| - @ignore_list << mail.strip.downcase - end - File.open(EPAFI_IGNORE_FILE, 'w') do |f| - f.write @ignore_list.to_a.to_yaml - end - end - - def include? mail - return ( - (@ignore_list.include? mail.strip.downcase) or - (@keep_list.include? mail.strip.downcase) - ) - end - end - - class CrawlerApp - attr_reader :imap - attr_reader :contacts - - TMPMAIL_FILE = '.tmpmail' - - def initialize config - @saved_key = 'RFC822' - @filter_headers = 'BODY[HEADER.FIELDS (FROM TO Subject)]'.upcase - @config = config - @imap = nil - @contact_manager = ContactManager.new config - end - - - def connect! - @imap = Net::IMAP.new( - @config[:imap][:server], - ssl: {verify_mode: OpenSSL::SSL::VERIFY_NONE}, - port: 993 - ) - @imap.login(@config[:imap][:login], @config[:imap][:password]) - #@imap.select(SOURCE_MAILBOX) - end - - def disconnect! - imap.logout - imap.disconnect - end - - MAIL_REGEXP = /\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}\b/ - - def examine_message message - m = Mail.read_from_string message.attr[@saved_key] - return if m.from.nil? - return if m.to.nil? - - - emails = Set.new - begin - emails.merge m.from - emails.merge [m.to].flatten if m.to - emails.merge [m.cc].flatten if m.cc - rescue => e - binding.pry - end - - body_emails = Set.new - m.body.parts.each do |part| - next if part.content_type != 'text/plain' - - #body_emails = m.body.decoded.scan MAIL_REGEXP - part_emails = part.decoded.scan MAIL_REGEXP - #pp body_emails - if not part_emails.empty? then - body_emails.merge part_emails - end - end - emails.merge body_emails - - # puts emails.to_a.join(' , ') - remaining_emails = ( - emails - .map{ |e| [e, (@contact_manager.include? e)] } - .select{ |e,t| !t } - ) - seen_emails = ( - remaining_emails - .empty? - ) - # puts @contacts.to_a.join(', ') - if seen_emails then - print "." - return - else - puts "" - all_addr = { - from: (m.from || []), - to: (m.to || []), - cc: (m.cc || []), - body: (body_emails || []) - } - all_addr.each do |key, list| - list.each do |addr| - addr_str = if remaining_emails.map{|e,t| e}.include? addr then - addr.yellow.on_black - else addr - end - str = "%4s: %s" % [key.to_s.upcase, addr_str] - puts str - end - end - puts "" - #puts " ORIGINAL EMAILS: #{emails.to_a.join(', ')}" - #puts "REMAINING EMAILS: #{remaining_emails.map{|e,t| e}.join(', ')}".yellow.on_black - #puts " SEEN EMAILS: #{seen_emails}" - end - - while true - begin - puts "\n### #{m.subject}" - print "#{m.from.join(',')} --> #{m.to.join(',')} " - puts "[Ignore/Add/Skip/Detail] ?" - - i = STDIN.gets - case i.strip - when /^[iI]$/ then # ignore - @contact_manager.ignore_contact remaining_emails.map{|e,t| e} - break - when /^[aA]$/ then # add - @contact_manager.keep_contact remaining_emails.map{|e,t| e} - break - when /^[sS]$/ then #skip - break - when /^[dD]$/ then # decode - # puts m.body.decoded - File.open(TMPMAIL_FILE + ".2", 'w') do |f| - f.write message.attr[@saved_key] - end - system "formail < #{TMPMAIL_FILE}.2 > #{TMPMAIL_FILE}" - system "mutt -R -f #{TMPMAIL_FILE}" - end - rescue Encoding::ConverterNotFoundError - STDERR.puts "ERROR: encoding problem in email. Unable to convert." - end - end - - return - end - - def examine_all - @imap.list('', '*').each do |mailbox| - puts "\nMAILBOX #{mailbox.name}".yellow - next unless mailbox.name =~ /#{@config[:imap][:pattern]}/ - @imap.examine mailbox.name - - puts "Searching #{mailbox.name}" - messages_in_mailbox = @imap.responses['EXISTS'][0] - if not messages_in_mailbox then - say "#{mailbox.name} does not have any messages" - next - end - - @imap.select mailbox.name #GYR: TEST - ids = @imap.search('SINCE 1-Jan-2001') - # NOT OR TO "@agilefant.org" CC "@agilefant.org"') - if ids.empty? - puts "\tFound no messages" - else - examine_message_list mailbox.name, ids - end - end - end - - def examine_message_list mailbox_name, ids - ids.each do |id| - @imap.select mailbox_name #GYR: TEST - message = imap.fetch(id, [@saved_key])[0] - examine_message message - end - rescue IOError - # re-connect and try again - connect! - retry - end - end - - class Crawler < Thor - - CONFIG_FILE = 'config/secrey.yml' - - include Thor::Actions - default_task :crawl - - - desc 'crawl', 'Crawls email to save mails' - def crawl - #saved_info = [] - parse_configuration - - ## Run application - app = CrawlerApp.new @config - - app.connect! - app.examine_all - #pp saved_info - app.disconnect! - end - - def initialize *args - @config = {} - super - end - - private - - - def parse_configuration - ## Load configuration - @config.merge! Hash.transform_keys_to_symbols( - YAML::load( File.open( EPAFI_CONFIG_FILE ) ) - ) - - ## Validate configuration structure - validations = { - crm: { - baseurl: lambda { |url| url =~ URI::regexp }, - login: 'string', - password: 'string' - }, - imap: { - server: 'string', - login: 'string', - password: 'string' - } - } - validator = HashValidator.validate(@config, validations) - raise "Configuration is not valid: #{validator.errors.inspect}" unless validator.valid? - end - end -end - -Epafh::Crawler.start +Epafh::App.start(ARGV) diff --git a/epafh.gemspec b/epafh.gemspec index ac0f5a9..dcfb14d 100644 --- a/epafh.gemspec +++ b/epafh.gemspec @@ -1,27 +1,19 @@ # coding: utf-8 lib = File.expand_path('../lib', __FILE__) $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) -require 'epafh/version' +require 'epafh/constants' Gem::Specification.new do |spec| spec.name = "epafh" spec.version = Epafh::VERSION - spec.authors = ["@@@No user configured@@@"] - spec.email = ["@@@No user configured@@@"] + spec.authors = ["Glenn Y. Rolland"] + spec.email = ["glenux@glenux.net"] - spec.summary = %q{TODO: Write a short summary, because Rubygems requires one.} - spec.description = %q{TODO: Write a longer description or delete this line.} - spec.homepage = "TODO: Put your gem's website or public repo URL here." + spec.summary = %q{A handy tool to extract emails and URLs from an IMAP account.} + spec.description = %q{A handy tool to extract emails and URLs from an IMAP account.} + spec.homepage = "https://github.com/glenux/epafh" spec.license = "LGPL-3" - # Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or - # delete this section to allow pushing this gem to any host. - if spec.respond_to?(:metadata) - spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'" - else - raise "RubyGems 2.0 or newer is required to protect against public gem pushes." - end - spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) } spec.bindir = "bin" spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } @@ -37,4 +29,6 @@ Gem::Specification.new do |spec| spec.add_runtime_dependency "hash_validator" spec.add_runtime_dependency "pry" spec.add_runtime_dependency "pry-rescue" + spec.add_runtime_dependency "highline" end + diff --git a/lib/epafh.rb b/lib/epafh.rb index aaee149..af9fbbb 100644 --- a/lib/epafh.rb +++ b/lib/epafh.rb @@ -1,5 +1,10 @@ -require "epafh/version" module Epafh - # Your code goes here... end + +require "epafh/hash" +require "epafh/constants" +require "epafh/contact_manager" +require "epafh/crawler" +require "epafh/app" + diff --git a/lib/epafh/app.rb b/lib/epafh/app.rb new file mode 100644 index 0000000..27db256 --- /dev/null +++ b/lib/epafh/app.rb @@ -0,0 +1,54 @@ +class Epafh::App < Thor + + CONFIG_FILE = 'config/secrey.yml' + + include Thor::Actions + default_task :crawl + + + desc 'crawl', 'Crawls email to save mails' + def crawl + #saved_info = [] + parse_configuration + + ## Run application + app = Crawler.new @config + + app.connect! + app.examine_all + #pp saved_info + app.disconnect! + end + + def initialize *args + @config = {} + super + end + + private + + + def parse_configuration + ## Load configuration + @config.merge! Hash.transform_keys_to_symbols( + YAML::load( File.open( Epafh::EPAFI_CONFIG_FILE ) ) + ) + + ## Validate configuration structure + validations = { + crm: { + baseurl: lambda { |url| url =~ URI::regexp }, + login: 'string', + password: 'string' + }, + imap: { + server: 'string', + login: 'string', + password: 'string' + } + } + validator = HashValidator.validate(@config, validations) + raise "Configuration is not valid: #{validator.errors.inspect}" unless validator.valid? + end +end + diff --git a/lib/epafh/constants.rb b/lib/epafh/constants.rb new file mode 100644 index 0000000..de695a4 --- /dev/null +++ b/lib/epafh/constants.rb @@ -0,0 +1,6 @@ +module Epafh + VERSION = "0.1.0" + + EPAFI_CONFIG_FILE = File.join(ENV['HOME'],'.epafh','config.yml') + EPAFI_IGNORE_FILE = File.join(ENV['HOME'],'.epafh','ignore.yml') +end diff --git a/lib/epafh/contact_manager.rb b/lib/epafh/contact_manager.rb new file mode 100644 index 0000000..23b9654 --- /dev/null +++ b/lib/epafh/contact_manager.rb @@ -0,0 +1,106 @@ +class Epafh::ContactManager + + CRM_LOGIN_URL = '/login' + CRM_LEADS_URL = '/leads.json' + CRM_CONTACTS_URL = '/contacts.json' + + + def initialize config + @config = config + + @browser = Mechanize.new { |agent| + agent.user_agent_alias = 'Mac Safari' + } + @ignore_list = Set.new + @keep_list = Set.new + + ## Load configuration file + # + + unless File.exist? EPAFI_CONFIG_FILE then + raise "Unable to find configuration file #{EPAFI_CONFIG_FILE}" + end + @config = config + + + connect! + load_contacts + load_leads + load_ignore + #puts @keep_list.to_a + rescue RuntimeError => e + STDERR.puts e.message + end + + def connect! + @browser.get(@config[:crm][:baseurl] + CRM_LOGIN_URL) do |page| + page.form_with(action: '/authentication') do |f| + f['authentication[username]'] = @config[:crm][:login] + f['authentication[password]'] = @config[:crm][:password] + end.click_button + end + + rescue Mechanize::ResponseCodeError + raise "Authentication error. Verify your credentials." + end + + def load_ignore + if File.exist? EPAFI_IGNORE_FILE + ignore_list = YAML.load_file(EPAFI_IGNORE_FILE) + ignore_list.each do |email| + @ignore_list << email.strip.downcase + end + end + end + + def load_leads page=1 + crm_leads_page = @browser.get(@config[:crm][:baseurl] + CRM_LEADS_URL + "?page=#{page}") + crm_leads = JSON.parse crm_leads_page.body + crm_leads.each do |lead_obj| + keep_contact lead_obj['lead']['email'].split(',') + keep_contact lead_obj['lead']['alt_email'].split(',') + end + + if crm_leads.size > 0 then + load_leads (page + 1) + end + end + + def load_contacts page=1 + crm_contacts_page = @browser.get(@config[:crm][:baseurl] + CRM_CONTACTS_URL + "?page=#{page}") + crm_contacts = JSON.parse crm_contacts_page.body + crm_contacts.each do |contact_obj| + keep_contact contact_obj['contact']['email'].split(',') + keep_contact contact_obj['contact']['alt_email'].split(',') + end + + if crm_contacts.size > 0 then + load_contacts (page + 1) + end + #contacts.to_a.sort.join(', ') + end + + def keep_contact emails + emails = emails.to_a if emails.is_a? Set + [emails].flatten.each do |mail| + @keep_list << mail.strip.downcase + end + end + + def ignore_contact emails + emails = emails.to_a if emails.is_a? Set + [emails].flatten.each do |mail| + @ignore_list << mail.strip.downcase + end + File.open(EPAFI_IGNORE_FILE, 'w') do |f| + f.write @ignore_list.to_a.to_yaml + end + end + + def include? mail + return ( + (@ignore_list.include? mail.strip.downcase) or + (@keep_list.include? mail.strip.downcase) + ) + end +end diff --git a/lib/epafh/crawler.rb b/lib/epafh/crawler.rb new file mode 100644 index 0000000..f7b5d5a --- /dev/null +++ b/lib/epafh/crawler.rb @@ -0,0 +1,164 @@ + +class Epafh::Crawler + attr_reader :imap + attr_reader :contacts + + TMPMAIL_FILE = '.tmpmail' + + def initialize config + @saved_key = 'RFC822' + @filter_headers = 'BODY[HEADER.FIELDS (FROM TO Subject)]'.upcase + @config = config + @imap = nil + @contact_manager = ContactManager.new config + end + + + def connect! + @imap = Net::IMAP.new( + @config[:imap][:server], + ssl: {verify_mode: OpenSSL::SSL::VERIFY_NONE}, + port: 993 + ) + @imap.login(@config[:imap][:login], @config[:imap][:password]) + #@imap.select(SOURCE_MAILBOX) + end + + def disconnect! + imap.logout + imap.disconnect + end + + MAIL_REGEXP = /\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}\b/ + + def examine_message message + m = Mail.read_from_string message.attr[@saved_key] + return if m.from.nil? + return if m.to.nil? + + + emails = Set.new + emails.merge m.from + emails.merge [m.to].flatten if m.to + emails.merge [m.cc].flatten if m.cc + + body_emails = Set.new + m.body.parts.each do |part| + next if part.content_type != 'text/plain' + + #body_emails = m.body.decoded.scan MAIL_REGEXP + part_emails = part.decoded.scan MAIL_REGEXP + #pp body_emails + if not part_emails.empty? then + body_emails.merge part_emails + end + end + emails.merge body_emails + + # puts emails.to_a.join(' , ') + remaining_emails = ( + emails + .map{ |e| [e, (@contact_manager.include? e)] } + .select{ |e,t| !t } + ) + seen_emails = ( + remaining_emails + .empty? + ) + # puts @contacts.to_a.join(', ') + if seen_emails then + print "." + return + else + puts "" + all_addr = { + from: (m.from || []), + to: (m.to || []), + cc: (m.cc || []), + body: (body_emails || []) + } + all_addr.each do |key, list| + list.each do |addr| + addr_str = if remaining_emails.map{|e,t| e}.include? addr then + addr.yellow.on_black + else addr + end + str = "%4s: %s" % [key.to_s.upcase, addr_str] + puts str + end + end + puts "" + #puts " ORIGINAL EMAILS: #{emails.to_a.join(', ')}" + #puts "REMAINING EMAILS: #{remaining_emails.map{|e,t| e}.join(', ')}".yellow.on_black + #puts " SEEN EMAILS: #{seen_emails}" + end + + while true + begin + puts "\n### #{m.subject}" + print "#{m.from.join(',')} --> #{m.to.join(',')} " + puts "[Ignore/Add/Skip/Detail] ?" + + i = STDIN.gets + case i.strip + when /^[iI]$/ then # ignore + @contact_manager.ignore_contact remaining_emails.map{|e,t| e} + break + when /^[aA]$/ then # add + @contact_manager.keep_contact remaining_emails.map{|e,t| e} + break + when /^[sS]$/ then #skip + break + when /^[dD]$/ then # decode + # puts m.body.decoded + File.open(TMPMAIL_FILE + ".2", 'w') do |f| + f.write message.attr[@saved_key] + end + system "formail < #{TMPMAIL_FILE}.2 > #{TMPMAIL_FILE}" + system "mutt -R -f #{TMPMAIL_FILE}" + end + rescue Encoding::ConverterNotFoundError + STDERR.puts "ERROR: encoding problem in email. Unable to convert." + end + end + + return + end + + def examine_all + @imap.list('', '*').each do |mailbox| + puts "\nMAILBOX #{mailbox.name}".yellow + next unless mailbox.name =~ /#{@config[:imap][:pattern]}/ + @imap.examine mailbox.name + + puts "Searching #{mailbox.name}" + messages_in_mailbox = @imap.responses['EXISTS'][0] + if not messages_in_mailbox then + say "#{mailbox.name} does not have any messages" + next + end + + @imap.select mailbox.name #GYR: TEST + ids = @imap.search('SINCE 1-Jan-2001') + # NOT OR TO "@agilefant.org" CC "@agilefant.org"') + if ids.empty? + puts "\tFound no messages" + else + examine_message_list mailbox.name, ids + end + end + end + + def examine_message_list mailbox_name, ids + ids.each do |id| + @imap.select mailbox_name #GYR: TEST + message = imap.fetch(id, [@saved_key])[0] + examine_message message + end + rescue IOError + # re-connect and try again + connect! + retry + end + +end diff --git a/lib/epafh/hash.rb b/lib/epafh/hash.rb new file mode 100644 index 0000000..2e05149 --- /dev/null +++ b/lib/epafh/hash.rb @@ -0,0 +1,11 @@ + +class Hash + #take keys of hash and transform those to a symbols + def self.transform_keys_to_symbols(value) + return value if not value.is_a?(Hash) + hash = value.inject({}) do |memo,(k,v)| + memo[k.to_sym] = Hash.transform_keys_to_symbols(v); memo + end + return hash + end +end diff --git a/lib/epafh/version.rb b/lib/epafh/version.rb deleted file mode 100644 index 8261627..0000000 --- a/lib/epafh/version.rb +++ /dev/null @@ -1,3 +0,0 @@ -module Epafh - VERSION = "0.1.0" -end