#!/usr/bin/ruby
# frozen_string_literal: true

#============================================================================================#
# eml2mbox.rb v0.08
# Last updated: Jan 23, 2004
#
# Converts a bunch of eml files into one mbox file.
#
# Usage: [ruby] eml2mbox.rb [-c] [-l] [-s] [-yz] [emlpath [trgtmbx]]
# Switches:
#   -c   Remove CRs (^M) appearing at end of lines (Unix)
#   -l   Remove a LF left at the end of a line (old Mac) - not tested
#   -s   Don't use standard mbox postmark formatting (for From_ line)
#        This will force the use of original From and Date found in mail headers.
#        Not recommended, unless you really have problems importing emls.
#   -yz  Place the timezone before the year in the date of the From_ line
#        ([timezone][year]) instead of the default [year][timezone].
#   emlpath - Path of dir with eml files. Defaults to the current dir if not specified
#   trgtmbx - Name of the target mbox file. Defaults to "archive.mbox" in 'emlpath'.
#             A relative name given explicitly is resolved inside 'emlpath'.
#
# Ruby homepage:        http://www.ruby-lang.org/en/
# Unix mailbox format:  http://www.broobles.com/eml2mbox/mbox.html
# This script :         http://www.broobles.com/eml2mbox
#
#============================================================================================#
# Licence:
#
# This script is free software; you can redistribute it and/or modify it under the terms of
# the GNU Lesser General Public License as published by the Free Software Foundation;
# either version 2.1 of the License, or (at your option) any later version.
#
# You should have received a copy of the GNU Lesser General Public License along with this
# script; if not, please visit http://www.gnu.org/copyleft/gpl.html for more information.
#============================================================================================#
# Provenance: eml2mbox/eml2mbox.rb as added by commit 466c31cd91e5576405af190f76c073e1d06a4482
# (Silvio Rhatto, Thu, 18 Sep 2014 18:11:07 -0300, "Sorting scripts").
#============================================================================================#

# The "parsedate" library was removed in Ruby 1.9; Date._parse is the stdlib
# parser it was built on and is available on old and new interpreters alike.
require "date"

#=======================================================#
# Class that encapsulates the processing file in memory #
#=======================================================#

# Accumulates the lines of one eml file and derives the mbox "From_" postmark
# (sender + date) from its From: and Date: headers.
class FileInMemory

  # Standard zones by RFC 2822, mapped to the numeric offset used in the postmark.
  ZoneOffset = {
    'UTC' => '0000',
    'UT'  => '0000',  'GMT' => '0000',
    'EST' => '-0500', 'EDT' => '-0400',
    'CST' => '-0600', 'CDT' => '-0500',
    'MST' => '-0700', 'MDT' => '-0600',
    'PST' => '-0800', 'PDT' => '-0700',
  }.freeze

  def initialize
    @lines = []
    @counter = 1  # keep the 0 position for the From_ line
    @from = nil   # from part of the From_ line
    @date = nil   # date part of the From_ line
  end

  # Feeds one raw line (including its line break) of the eml file.
  # Picks up the first usable From: and Date: headers on the way and stores
  # the line, with line endings fixed according to the active switches.
  def addLine(line)
    # Once the real sender is known, escape 'false' From_ lines. Only "From "
    # (with a space) is an mbox separator; "From:" headers of forwarded or
    # embedded messages must be left untouched.
    line = ">" + line if @from && line =~ /\AFrom /

    # If the line is the first valid From: header, save it (without the line break)
    if @from.nil? && line =~ /\AFrom:\s.*@/
      sender = line.sub(/From:/, 'From').chomp
      sender = standardizeFrom(sender) unless $switches["noStandardFromLine"]
      @from = sender
    end

    # Get the date from the first Date: header that yields one
    if @date.nil? && line =~ /\ADate:\s/
      raw = line.sub(/Date:\s/, '')
      # With -s the header content is used verbatim (line break included)
      @date = $switches["noStandardFromLine"] ? raw : parseMboxDate(raw)
    end

    # Now add the line to the array
    @lines[@counter] = fixLineEndings(line)
    @counter += 1
  end

  # Forms the first line (from + date) and returns all the lines of the file,
  # followed by an empty separator line.
  # Returns nil when no usable From: header was seen.
  def getProcessedLines
    return nil if @from.nil?

    if @date.nil?
      puts "WARN: Failed to extract date. Will use current time in the From_ line"
      @date = formMboxDate(Time.now, nil)
    end
    @lines[0] = fixLineEndings(@from + " " + @date)
    @lines[@counter] = ""
    @lines
  end

  # Fixes CR/LFs according to the -c / -l switches
  def fixLineEndings(line)
    line = removeCR(line) if $switches["removeCRs"]
    line = removeLF(line) if $switches["removeLFs"]
    line
  end

  # emls usually have CR+LF (DOS) line endings, Unix uses LF as a line break,
  # so there's a hanging CR at the end of the line when viewed on Unix.
  # This method removes the next to the last character when it is a CR.
  def removeCR(line)
    # line[-2, 1] yields a one-character String on every Ruby version, unlike
    # line[-2], which stopped returning an Integer in Ruby 1.9.
    return line unless line[-2, 1] == "\r"

    line[0...-2] + line[-1, 1]
  end

  # Similar to the above. This one is for Macs that use CR as a line break.
  # So, remove the last char when it is a LF.
  def removeLF(line)
    return line unless line[-1, 1] == "\n"

    line[0...-1]
  end

  private

  # Parses the content of a Date: header and converts it to the mbox standard
  # for the From_ line. Returns nil when no calendar date can be extracted, so
  # the caller can fall back to the current time instead of crashing.
  def parseMboxDate(raw)
    parts = Date._parse(raw)
    return nil unless parts[:year] && parts[:mon] && parts[:mday]

    # Need to convert the timezone from a name to a numeric offset
    timezone = parts[:zone]
    timezone = ZoneOffset[timezone] unless timezone.to_s =~ /\A[+-]\d/
    # The wall-clock fields are kept as sent; Time.gm is only used for formatting
    time = Time.gm(parts[:year], parts[:mon], parts[:mday], parts[:hour], parts[:min], parts[:sec])
    formMboxDate(time, timezone)
  rescue ArgumentError, TypeError
    nil
  end

end

#================#
# Helper methods #
#================#

# Reduces a From line to its bare address:
#   'From "some one" <aa@aa.aa>'  ->  'From aa@aa.aa'
# Lines without a well ordered <...> pair are returned unchanged - they are
# either already well formed or invalid.
def standardizeFrom(fromLine)
  # Get indexes of last "<" and ">" in line
  openIndex = fromLine.rindex('<')
  closeIndex = fromLine.rindex('>')
  return fromLine unless openIndex && closeIndex && openIndex < closeIndex

  fromLine[0..4] + fromLine[(openIndex + 1)..(closeIndex - 1)]
end

# Returns a mbox postmark formatted date.
# If timezone is unknown (nil), it is skipped; otherwise it goes after the
# year, or before it when the -yz switch is active.
# mbox date format used is described here:
# http://www.broobles.com/eml2mbox/mbox.html
def formMboxDate(time, timezone)
  stamp = time.strftime("%a %b %d %H:%M:%S")
  year = time.strftime("%Y")
  return "#{stamp} #{year}" if timezone.nil?

  # The zone is appended as plain text rather than spliced into the strftime
  # format, so an odd zone string can never be read as a format directive.
  if $switches["zoneYearOrder"]
    "#{stamp} #{timezone} #{year}"
  else
    "#{stamp} #{year} #{timezone}"
  end
end

# Extracts all leading switches from the command line and returns
# a hashmap with valid switch names as keys and booleans as values.
# The switches are removed from ARGV, so the real params end up at the
# beginning of the ARGV array (ARGV[0] = emlpath, ARGV[1] = trgtmbx).
def extractSwitches
  switches = Hash.new(false) # All switches (values) default to false
  while ARGV.first.to_s =~ /\A-/ # while arguments are switches
    switch = ARGV.shift
    case switch
    when "-c"
      switches["removeCRs"] = true
      puts "\nWill fix lines ending with a CR"
    when "-l"
      switches["removeLFs"] = true
      puts "\nWill fix lines beggining with a LF"
    when "-s"
      switches["noStandardFromLine"] = true
      puts "\nWill use From and Date from mail headers in From_ line"
    when "-yz"
      switches["zoneYearOrder"] = true
      puts "\nTimezone will be placed before the year in From_ line"
    else
      puts "\nUnknown switch: " + switch + ". Ignoring."
    end
  end
  switches
end

# Decides how the target archive is opened: "wb" for a new file, otherwise
# whatever the user selects ("ab" to append, "wb" to overwrite).
# Returns nil when the user cancels (the default, also on end of input).
def chooseOpenMode(mboxArchive)
  return "wb" unless FileTest.exist?(mboxArchive)

  print "\nFile [" + mboxArchive + "] exists! Please select: [A]ppend [O]verwrite [C]ancel (default) "
  case STDIN.gets.to_s.strip.downcase
  when "a" then "ab"
  when "o" then "wb"
  end
end

# Converts one eml file and appends the result to the open mbox stream.
def appendEml(name, mbox)
  puts "Processing file: " + name
  message = FileInMemory.new
  # Binary mode: mails come in arbitrary charsets and must not be decoded,
  # and the block form closes the handle again.
  File.open(name, "rb") { |eml| eml.each_line { |line| message.addLine(line) } }
  lines = message.getProcessedLines
  if lines.nil?
    puts "WARN: File [" + name + "] doesn't seem to have a regular From: line. Not included in mbox"
  else
    lines.each { |line| mbox.puts line }
  end
end

#===============#
#     Main      #
#===============#

# Parses the command line and converts every *.eml file of the chosen
# directory into the target mbox file.
def main
  $switches = extractSwitches

  # Extract specified directory with emls and the target archive (if any)
  emlDir = ARGV[0] || "."
  targetGiven = !ARGV[1].nil?
  mboxArchive = targetGiven ? ARGV[1] : emlDir + "/archive.mbox"

  # Show specified settings
  puts "\nSpecified dir : " + emlDir
  puts "Specified file: " + mboxArchive + "\n"

  # Check that the dir exists
  unless FileTest.directory?(emlDir)
    puts "\n[" + emlDir + "] is not a directory (might not exist). Please specify a valid dir"
    exit(1)
  end

  # The default target is built from emlDir, so pin it down before changing
  # into that directory; a relative emlDir would otherwise be applied twice.
  mboxArchive = File.expand_path(mboxArchive) unless targetGiven
  Dir.chdir(emlDir)

  # Look for input before touching the archive: an existing mbox must never
  # be opened, truncated or deleted when there is nothing to convert.
  files = Dir["*.eml"].sort
  if files.empty?
    puts
    puts "No *.eml files in this directory. mbox file not created."
    return
  end

  # Check if destination file exists. If yes allow user to select an option.
  mode = chooseOpenMode(mboxArchive)
  return if mode.nil?

  puts
  # For each .eml file in the specified directory do the conversion
  File.open(mboxArchive, mode) do |mbox|
    files.each { |name| appendEml(name, mbox) }
  end
end

main if __FILE__ == $0