Cleanup & restructure Crawler class.
This commit is contained in:
parent
7bfc3beb2a
commit
f07a23833f
3 changed files with 53 additions and 63 deletions
|
@ -49,7 +49,7 @@ class Epafh::App < Thor
|
||||||
parse_configuration
|
parse_configuration
|
||||||
|
|
||||||
## Run application
|
## Run application
|
||||||
app = Crawler.new @config
|
app = Epafh::Crawler.new @config
|
||||||
|
|
||||||
app.connect!
|
app.connect!
|
||||||
app.examine_all
|
app.examine_all
|
||||||
|
|
|
@ -17,8 +17,8 @@ class Epafh::ContactManager
|
||||||
## Load configuration file
|
## Load configuration file
|
||||||
#
|
#
|
||||||
|
|
||||||
unless File.exist? EPAFI_CONFIG_FILE then
|
unless File.exist? Epafh::EPAFI_CONFIG_FILE then
|
||||||
raise "Unable to find configuration file #{EPAFI_CONFIG_FILE}"
|
raise "Unable to find configuration file #{Epafh::EPAFI_CONFIG_FILE}"
|
||||||
end
|
end
|
||||||
@config = config
|
@config = config
|
||||||
|
|
||||||
|
@ -45,8 +45,8 @@ class Epafh::ContactManager
|
||||||
end
|
end
|
||||||
|
|
||||||
def load_ignore
|
def load_ignore
|
||||||
if File.exist? EPAFI_IGNORE_FILE
|
if File.exist? Epafh::EPAFI_IGNORE_FILE
|
||||||
ignore_list = YAML.load_file(EPAFI_IGNORE_FILE)
|
ignore_list = YAML.load_file(Epafh::EPAFI_IGNORE_FILE)
|
||||||
ignore_list.each do |email|
|
ignore_list.each do |email|
|
||||||
@ignore_list << email.strip.downcase
|
@ignore_list << email.strip.downcase
|
||||||
end
|
end
|
||||||
|
@ -92,7 +92,7 @@ class Epafh::ContactManager
|
||||||
[emails].flatten.each do |mail|
|
[emails].flatten.each do |mail|
|
||||||
@ignore_list << mail.strip.downcase
|
@ignore_list << mail.strip.downcase
|
||||||
end
|
end
|
||||||
File.open(EPAFI_IGNORE_FILE, 'w') do |f|
|
File.open(Epafh::EPAFI_IGNORE_FILE, 'w') do |f|
|
||||||
f.write @ignore_list.to_a.to_yaml
|
f.write @ignore_list.to_a.to_yaml
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -4,16 +4,16 @@ class Epafh::Crawler
|
||||||
attr_reader :contacts
|
attr_reader :contacts
|
||||||
|
|
||||||
TMPMAIL_FILE = '.tmpmail'
|
TMPMAIL_FILE = '.tmpmail'
|
||||||
|
MAIL_REGEXP = /\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}\b/
|
||||||
|
|
||||||
def initialize config
|
def initialize config
|
||||||
@saved_key = 'RFC822'
|
@saved_key = 'RFC822'
|
||||||
@filter_headers = 'BODY[HEADER.FIELDS (FROM TO Subject)]'.upcase
|
@filter_headers = 'BODY[HEADER.FIELDS (FROM TO Subject)]'.upcase
|
||||||
@config = config
|
@config = config
|
||||||
@imap = nil
|
@imap = nil
|
||||||
@contact_manager = ContactManager.new config
|
@contact_manager = Epafh::ContactManager.new config
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
def connect!
|
def connect!
|
||||||
@imap = Net::IMAP.new(
|
@imap = Net::IMAP.new(
|
||||||
@config[:imap][:server],
|
@config[:imap][:server],
|
||||||
|
@ -29,83 +29,45 @@ class Epafh::Crawler
|
||||||
imap.disconnect
|
imap.disconnect
|
||||||
end
|
end
|
||||||
|
|
||||||
MAIL_REGEXP = /\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}\b/
|
|
||||||
|
|
||||||
def examine_message message
|
def examine_message message
|
||||||
m = Mail.read_from_string message.attr[@saved_key]
|
m = Mail.read_from_string message.attr[@saved_key]
|
||||||
return if m.from.nil?
|
return if m.from.nil?
|
||||||
return if m.to.nil?
|
return if m.to.nil?
|
||||||
|
|
||||||
|
body_emails = extract_body_mail m.body.parts
|
||||||
|
|
||||||
|
## Create association between extracted addresses and email parts
|
||||||
|
mail_struct = {
|
||||||
|
from: [m.from || []].flatten,
|
||||||
|
to: [m.to || []].flatten,
|
||||||
|
cc: [m.cc || []].flatten,
|
||||||
|
body: (body_emails.to_a || [])
|
||||||
|
}
|
||||||
emails = Set.new
|
emails = Set.new
|
||||||
emails.merge m.from
|
mail_struct.each {|key, val| emails.merge val }
|
||||||
emails.merge [m.to].flatten if m.to
|
remaining_emails = emails.reject{|e| @contact_manager.include?(e) }
|
||||||
emails.merge [m.cc].flatten if m.cc
|
|
||||||
|
|
||||||
body_emails = Set.new
|
# Skip examination if no addresses remain
|
||||||
m.body.parts.each do |part|
|
if remaining_emails.empty? then
|
||||||
next if part.content_type != 'text/plain'
|
|
||||||
|
|
||||||
#body_emails = m.body.decoded.scan MAIL_REGEXP
|
|
||||||
part_emails = part.decoded.scan MAIL_REGEXP
|
|
||||||
#pp body_emails
|
|
||||||
if not part_emails.empty? then
|
|
||||||
body_emails.merge part_emails
|
|
||||||
end
|
|
||||||
end
|
|
||||||
emails.merge body_emails
|
|
||||||
|
|
||||||
# puts emails.to_a.join(' , ')
|
|
||||||
remaining_emails = (
|
|
||||||
emails
|
|
||||||
.map{ |e| [e, (@contact_manager.include? e)] }
|
|
||||||
.select{ |e,t| !t }
|
|
||||||
)
|
|
||||||
seen_emails = (
|
|
||||||
remaining_emails
|
|
||||||
.empty?
|
|
||||||
)
|
|
||||||
# puts @contacts.to_a.join(', ')
|
|
||||||
if seen_emails then
|
|
||||||
print "."
|
print "."
|
||||||
return
|
return
|
||||||
else
|
|
||||||
puts ""
|
|
||||||
all_addr = {
|
|
||||||
from: (m.from || []),
|
|
||||||
to: (m.to || []),
|
|
||||||
cc: (m.cc || []),
|
|
||||||
body: (body_emails || [])
|
|
||||||
}
|
|
||||||
all_addr.each do |key, list|
|
|
||||||
list.each do |addr|
|
|
||||||
addr_str = if remaining_emails.map{|e,t| e}.include? addr then
|
|
||||||
addr.yellow.on_black
|
|
||||||
else addr
|
|
||||||
end
|
|
||||||
str = "%4s: %s" % [key.to_s.upcase, addr_str]
|
|
||||||
puts str
|
|
||||||
end
|
|
||||||
end
|
|
||||||
puts ""
|
|
||||||
#puts " ORIGINAL EMAILS: #{emails.to_a.join(', ')}"
|
|
||||||
#puts "REMAINING EMAILS: #{remaining_emails.map{|e,t| e}.join(', ')}".yellow.on_black
|
|
||||||
#puts " SEEN EMAILS: #{seen_emails}"
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
display_header mail_struct, remaining_emails
|
||||||
|
|
||||||
while true
|
while true
|
||||||
begin
|
begin
|
||||||
puts "\n### #{m.subject}"
|
puts "\n### #{m.subject}"
|
||||||
print "#{m.from.join(',')} --> #{m.to.join(',')} "
|
print "#{mail_struct[:from].join(',')} --> #{mail_struct[:to].join(',')} "
|
||||||
puts "[Ignore/Add/Skip/Detail] ?"
|
puts "[Ignore/Add/Skip/Detail] ?"
|
||||||
|
|
||||||
i = STDIN.gets
|
i = STDIN.gets
|
||||||
case i.strip
|
case i.strip
|
||||||
when /^[iI]$/ then # ignore
|
when /^[iI]$/ then # ignore
|
||||||
@contact_manager.ignore_contact remaining_emails.map{|e,t| e}
|
@contact_manager.ignore_contact remaining_emails
|
||||||
break
|
break
|
||||||
when /^[aA]$/ then # add
|
when /^[aA]$/ then # add
|
||||||
@contact_manager.keep_contact remaining_emails.map{|e,t| e}
|
@contact_manager.keep_contact remaining_emails
|
||||||
break
|
break
|
||||||
when /^[sS]$/ then #skip
|
when /^[sS]$/ then #skip
|
||||||
break
|
break
|
||||||
|
@ -161,4 +123,32 @@ class Epafh::Crawler
|
||||||
retry
|
retry
|
||||||
end
|
end
|
||||||
|
|
||||||
|
## Collect the e-mail addresses found in the text/plain parts of a message body
#
# body_parts - enumerable of objects responding to #content_type and #decoded
#              (presumably Mail::Part instances; verify against caller).
#
# Returns a Set of the address strings matched by MAIL_REGEXP.
def extract_body_mail body_parts
  body_parts.each_with_object(Set.new) do |part, found|
    # A Content-Type value usually carries parameters
    # ("text/plain; charset=UTF-8"), so compare only the media type rather
    # than the whole header value; a missing type is treated as non-text.
    media_type = part.content_type.to_s.split(';').first.to_s.strip.downcase
    next unless media_type == 'text/plain'

    found.merge part.decoded.scan(MAIL_REGEXP)
  end
end
|
||||||
|
|
||||||
|
## Print every address of a message, grouped by the field it was found in
#
# header_struct    - Hash mapping a label (e.g. :from, :to, :cc, :body) to a
#                    list of address strings.
# remaining_emails - collection responding to #include?; addresses it contains
#                    are highlighted.
#
# Highlighting calls String#yellow and String#on_black, which are not defined
# in this block (presumably provided by a colorizing gem; verify).
def display_header header_struct, remaining_emails
  puts ""
  header_struct.each do |key, list|
    label = key.to_s.upcase
    list.each do |addr|
      shown = remaining_emails.include?(addr) ? addr.yellow.on_black : addr
      puts "%4s: %s" % [label, shown]
    end
  end
  puts ""
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in a new issue