#!/usr/bin/ruby

############################################################
# DocDiff: Document Diff
#   Created: Sat, 09 Dec 2000 +0900
#   Updated: Fri, 10 Aug 2001 +0900
#   Hisashi MORITA
#
############################################################
# Program identity: display name, release number and copyright notice.
APPLICATION_NAME  = 'DocDiff'
DOCDIFF_VERSION   = "0.2.0"
DOCDIFF_COPYRIGHT = 'Copyleft (C) 2000-2001 Hisashi MORITA'

############################################################
# adding some DocDiff specific methods to Hash, though this pollutes namespace.
# DocDiff::Hash would be better, but this seems unable to override literal
# {'foo'=>'bar'} in script.  hmm.
############################################################
# Convenience accessors for hashes that carry the string keys 'attribute'
# and 'value' (the shape DocDiff uses for one diff element).
class Hash
  # Returns the entry stored under 'attribute'.
  def attribute
    self['attribute']
  end

  # Stores new_attr under 'attribute'.
  def attribute=(new_attr)
    self['attribute'] = new_attr
  end

  # Returns the entry stored under 'value'.
  def value
    self['value']
  end

  # Stores new_value under 'value'.
  def value=(new_value)
    self['value'] = new_value
  end
end

###########################################################
# newline (end of line) character handler
###########################################################
module NewLine

  CR      = "\x0d"
  LF      = "\x0a"
  CRLF    = "\x0d\x0a"
  NONE    = "NONE"
  UNKNOWN = "UNKNOWN"

  #########################################################
  # guess what newline character is in use.
  #
  # string:        text to examine.
  # sample_length: number of leading characters examined first
  #                (0 means "examine the whole string").
  #
  # return value: CR, LF or CRLF when exactly one flavor is found,
  #               UNKNOWN when several flavors are mixed (binary data?),
  #               NONE when there is no newline at all (1-line text),
  #               nil when string itself is nil.
  #
  def guess_newline(string, sample_length = 4096)

    return nil if string.nil?

    ########
    # lists the newline flavors present in a text.
    # Every CRLF pair holds one CR and one LF, so whatever exceeds the
    # CRLF count is a lone CR / lone LF.  Counting this way also sees a
    # LF that sits at the very beginning of the text or directly after
    # another LF.
    flavors_in = lambda {|text|
      crlf_count = text.scan(/#{CRLF}/).size
      found = []
      found.push(CR)   if text.count(CR) > crlf_count
      found.push(LF)   if text.count(LF) > crlf_count
      found.push(CRLF) if crlf_count > 0
      found
    }

    # Read less to gain speed.
    sample = string[0..sample_length - 1]
    # Do not cut a CRLF pair in half: a sample ending in CR whose next
    # character is LF would otherwise be counted as a lone CR.
    if (sample.length < string.length) &&
       (sample[-1, 1] == CR) && (string[sample.length, 1] == LF)
      sample = string[0..sample.length]
    end

    found = flavors_in.call(sample)
    # Really none?  Read the whole string and make sure.
    found = flavors_in.call(string) if found.empty?

    # How many flavors found?
    case found.size
    when 0 then NONE       # 1-line file...
    when 1 then found[0]   # CR, LF, or CRLF
    else        UNKNOWN    # binary data...
    end
  end
  module_function :guess_newline

end

############################################################
# GNU diff wrapper for DocDiff
# usage:
#   d = Diff.new('/usr/bin/diff')
#   # d.set_option('some_option')  # if needed
#   result = d.compare(ary_of_str, ary_of_str)
############################################################
class Diff

  require 'tempfile'

  ##########################################################
  # diff_cmd: command line used to launch diff (ex: '/usr/bin/diff').
  # Runs "<diff_cmd> --help" once to learn which newline the tool emits.
  # Raises RuntimeError when the help text does not use exactly one of
  # CR, LF or CRLF.
  #
  def initialize(diff_cmd)
    @diff_cmd = diff_cmd
    @diff_opt = ""
    # check what type of newline character diff uses.
    # diff/cygwin may use LF or CRLF.
    help_text = nil
    # block form: the pipe is closed as soon as the text has been read.
    IO.popen("#{@diff_cmd} --help", "rb") {|help_process|
      help_text = help_process.read
    }
    nl = NewLine::guess_newline(help_text)
    case nl
    when NewLine::CR, NewLine::LF, NewLine::CRLF
      @diff_newline = nl
    else
      raise "#{@diff_cmd} --help returned" +
            " #{nl} (expected CR, LF or CRLF).\n"
    end
  end

  ##########################################################
  # set extra command line options, appended by compare().
  #
  def set_option(diff_opt)
    @diff_opt = diff_opt
  end

  ##########################################################
  # extra command line options currently in effect.
  #
  def get_option
    @diff_opt
  end

  ##########################################################
  # raw diff wrapper
  #  args:
  #  ["A\n"," little\n"," bird;\n","\n","\n","And\n"," a\n"," tree.\n","\n"],
  #  ["A\n"," bird;\n","\n","\n","In\n"," a\n"," shady\n"," tree.\n","\n"]
  #  return:
  #  ["---*\n", "+++*\n", "@@*\n",
  #   " A\n", "- little\n", "  bird;\n", " \n",
  #   " \n",
  #   "-And\n", "+In\n", "  a\n", "+ shady\n", "  tree.\n", " \n"]
  #  (an empty array when diff prints nothing, e.g. for identical input)
  #
  def diff(text1, text2, diff_opt)
    result = []

    tf1 = Tempfile.new("docdiff")
    tf2 = Tempfile.new("docdiff")

    begin
      # join explicitly; printing an Array does not concatenate its
      # elements on every Ruby version.
      tf1.print(Array(text1).join)
      tf2.print(Array(text2).join)
      tf1.close  # close only, not delete
      tf2.close

      # split the output on diff's own newline without touching the
      # global record separator ($/).
      IO.popen(
        "#{@diff_cmd} #{diff_opt} #{tf1.path} #{tf2.path}",
        "rb"
      ) {|diff_process|
        result = diff_process.readlines(@diff_newline)
      }
    ensure
      tf1.close(true)  # delete, even when diff could not be run
      tf2.close(true)
    end

    return result
  end

  ##########################################################
  # tidy up raw diff output
  #
  # args:   ["A", " little", " bird;\n", "\n", "And", " a", " tree.\n"]
  #         ["A", " bird;\n", "\n", "In", " a", " shady", " tree.\n"]
  # return: [[" ","A"], ["-"," little"], [" "," bird;"], [" ","\n"],
  #         [" ","\n"],
  #         ["-","And"], ["+","In"],
  #         [" "," a"], ["+"," shady"], [" "," tree."], [" ","\n"]]
  # warning: only ["foo","\n"] is accepted here, not ["foo\n"].
  #         you cannot just throw what you got from readlines().
  # raises: RuntimeError when either argument is empty, when the two
  #         texts use different/unknown newlines, or when diff printed
  #         nothing although the inputs differ.
  #
  def compare(text1, text2)

    if (text1.size == 0) || (text2.size == 0)
      raise "do not compare nils.\n"
    end

    nl1 = NewLine.guess_newline(text1.join)
    nl2 = NewLine.guess_newline(text2.join)

    if (nl1 != nl2)||(nl1 == NewLine::UNKNOWN)||
      (nl2 == NewLine::UNKNOWN)
      raise "Invalid newline:" +
            " #{nl1.inspect}, #{nl2.inspect}\n"
    end

    # max horizontal lines
    h_lines = ((text1.size + text2.size) / 2).to_i

    escaped1 = text1.collect{|elem| escape_newline(elem, nl1)}
    escaped2 = text2.collect{|elem| escape_newline(elem, nl2)}

    lines = diff(
      escaped1,
      escaped2,
      "--unified=#{h_lines} --horizon-lines=#{h_lines} " +
      @diff_opt
    )

    body = lines[3..-1]  # cut off header (first 3 lines)
    if body.nil?
      # diff prints nothing for identical input: report every element as
      # unchanged instead of failing on the missing header.
      if escaped1 != escaped2
        raise "#{@diff_cmd} returned no usable output.\n"
      end
      body = escaped1.collect{|line| " " + line}
    end

    return body.collect{|item|
      [item[0..0], unescape_newline(item[1..-1], nl1)]
    }
  end

  #########################################################
  # escape newline character
  # escape_newline("foo",  "\r\n") => "foo\n" on unix
  # escape_newline("\r\n", "\r\n") => "\n" on unix
  # raises RuntimeError for nil.
  #
  def escape_newline(string, nl_text)
    case string
    when nl_text
      return @diff_newline
    when nil
      raise "nil is not acceptable: #{string.inspect}\n"
    else
      return (string + @diff_newline)
    end
  end

  #########################################################
  # unescape newline character
  # unescape_newline("foo\n", "\r\n") => "foo" on unix
  # unescape_newline("\n",    "\r\n") => "\r\n" on unix
  # raises RuntimeError for nil.
  #
  def unescape_newline(string, nl_text)
    case string
    when @diff_newline
      return nl_text
    when nil
      raise "nil is not acceptable: #{string.inspect}\n"
    else
      return string.chomp(@diff_newline)
    end
  end

end  # end class Diff

############################################################
# ChaSen module for DocDiff
############################################################
module ChaSen

  begin
    require 'jcode'  # only shipped with Ruby 1.8 and earlier
  rescue LoadError
    # newer Ruby has no jcode; chasen() below only calls core String,
    # Array, IO and Tempfile methods, so it is safe to continue.
  end
  require 'tempfile'

  ##########################################################
  # run the ChaSen morphological analyzer on a string.
  #
  # string:         text to analyze (written to a temporary file).
  # chasen_cmdpath: command used to launch ChaSen.
  # chasen_option:  options placed between the command and the file name.
  #
  # return: array of morphemes, where
  #   - a trailing ',', '.', ';' or ':' is split off ("Mr." => "Mr", "."),
  #   - a " " morpheme is glued to the front of the morpheme after it,
  #   - a final "\n" is dropped when the input did not end with a newline.
  #
  def chasen(string, chasen_cmdpath = 'chasen', chasen_option = "-F \"%m\\n\" -r ~/.chasenrc.docdiff")

    analyzed = []

    tf = Tempfile.new(File.basename(__FILE__))
    begin
      tf.print(string)
      tf.close
      # one array element per output line; the block form closes the pipe.
      IO.popen("#{chasen_cmdpath} #{chasen_option} #{tf.path}", "rb") {|cha|
        analyzed = cha.readlines
      }
    ensure
      tf.close(true)  # delete, even when the command could not be run
    end

    # unless it is empty line, chop off the end of line char.
    analyzed = analyzed.collect do |item|
      if item != "\n"
        item.chomp
      else
        item
      end
    end

    # add some stuff to separate out ',' and '.' at end of word.
    # ['Mr', '.'] rather than 'Mr.'
    trailing_marks = [
      [/[^\,]\,$/, ','],
      [/[^\.]\.$/, '.'],
      [/[^\;]\;$/, ';'],
      [/[^\:]\:$/, ':']
    ]
    separated = []
    analyzed.each do |item|
      hit = trailing_marks.detect{|pair| pair[0] =~ item}
      if hit
        separated.push(item[0..-2], hit[1].dup)
      else
        separated.push(item)
      end
    end

    # space should be put to the beginning of next word.
    # Walk with an explicit index instead of deleting from the array being
    # iterated; a " " that has nothing after it is kept as it is.
    result = []
    i = 0
    while i < separated.size
      if (separated[i] == " ") && (i + 1 < separated.size)
        result.push(" " + separated[i + 1])
        i += 2
      else
        result.push(separated[i])
        i += 1
      end
    end

    # if ChaSen added unnecessary "\n" at the end, chomp it.
    if (string == string.chomp) && (result[-1] == "\n")
      result.pop
    end

    return result

  end  # end chasen

end  # end module ChaSen

############################################################
# DocDiff cache file module
############################################################
module Cache

  begin
    require 'digest/md5'
  rescue LoadError
    require 'md5'  # Ruby without the digest library
  end

  ##########################################################
  # provide default cache directory path (ex: /tmp/username.progname.cache)
  #
  def default_cachedir
    "#{ENV['TMPDIR']||ENV['TMP']||ENV['TEMP']||'/tmp'}/#{ENV['USER']||'-'}.#{File.basename($0)||'-'}.cache"
  end
  module_function :default_cachedir

  ##########################################################
  # path of the cache file that belongs to digest_seed:
  # <cache_dir without trailing slash>/<MD5 hex digest of digest_seed>
  #
  def cache_path(digest_seed, cache_dir = default_cachedir())
    hexdigest =
      if defined?(Digest::MD5)
        Digest::MD5.hexdigest(digest_seed)
      else
        MD5.new(digest_seed).hexdigest
      end
    cache_dir.sub(/\/$/, '') + "/" + hexdigest
  end
  module_function :cache_path

  ##########################################################
  # prepare cache directory, which is unique to user and program (not process)
  # creates it with mode 0700 when missing; returns true.
  # raises RuntimeError when the path exists but is not a writable directory.
  #
  def prepare_cachedir(cache_dir = default_cachedir())
    unless FileTest.exist?(cache_dir)
      Dir.mkdir(cache_dir, 0700)
      return true
    end
    unless FileTest.directory?(cache_dir)
      raise "Cache directory(#{cache_dir}) is not a directory (#{File.ftype(cache_dir)}).\n"
    end
    unless FileTest.writable?(cache_dir)
      raise "Cache directory(#{cache_dir}) is not writable.\n"
    end
    return true
  end
  module_function :prepare_cachedir

  ##########################################################
  # delete cache directory (useless since it does not delete cache files!)
  # returns true when the directory was removed, false when cache_dir does
  # not exist or is not a directory.
  #
  def remove_cachedir(cache_dir = default_cachedir())
    return false unless FileTest.exist?(cache_dir)
    return false unless FileTest.directory?(cache_dir)
    Dir.rmdir(cache_dir)
    return true
  end
  module_function :remove_cachedir

  ##########################################################
  # check if cache file already exists
  #
  def cache_exist?(digest_seed, cache_dir = default_cachedir())
    return File.exist?(cache_path(digest_seed, cache_dir))
  end
  module_function :cache_exist?

  ##########################################################
  # write cache file
  # as digest_seed, you have to give a string like methodname + args.inspect
  # returns the path of the file written.
  #
  def write_cache(data_to_write, digest_seed, cache_dir = default_cachedir())
    cache_file = cache_path(digest_seed, cache_dir)
    # block form flushes and closes the file before we return, so a
    # following read_cache() sees the complete data.
    File.open(cache_file, "wb") {|file|
      Marshal.dump(data_to_write, file)
    }
    return cache_file
  end
  module_function :write_cache

  ##########################################################
  # read cache file
  # NOTE(review): Marshal.load trusts the file content; the cache directory
  # should therefore be writable by its owner only.
  #
  def read_cache(digest_seed, cache_dir = default_cachedir())
    cache_file = cache_path(digest_seed, cache_dir)
    result = nil
    File.open(cache_file, "rb") {|file|
      result = Marshal.load(file)
    }
    return result
  end
  module_function :read_cache

end  # end module Cache

############################################################
# DocDiff main class
############################################################
class DocDiff

  require 'nkf'
  require 'cgi'
  # require 'jcode'

  include ChaSen
  #include Cache

  ##########################################################
  # initialization
  #
  # Sets every option to its default.  The cache directory is
  # '/tmp/<login name>'; '-' is used as the name when none of the USER,
  # LOGNAME or USERNAME environment variables is set (the same fallback
  # Cache.default_cachedir uses), instead of failing on a nil.
  def initialize()
    user_name = ENV['USER'] || ENV['LOGNAME'] || ENV['USERNAME'] || '-'

    @difference = Array.new
    @granularity    = 'word'
    @morphoanalysis = false
    @output_format  = 'html'
    @tag            = ''                      # <-implement this!
    @diff_cmd       = '/usr/bin/diff'
    @chasen_cmd     = '/usr/bin/chasen'
    @eol            = "\n"
    @rc_file        = '~/.docdiffrc'
    @chasen_rc      = '~/.chasenrc.docdiff'
    @cache          = 'on'
    @cache_dir      = '/tmp/' + user_name
  end
  # Public reader/writer pairs for the instance variables assigned in
  # initialize.
  attr_accessor :difference, :granularity, :morphoanalysis, :output_format,
                :tag, :diff_cmd, :chasen_cmd, :eol, :rc_file, :chasen_rc,
                :cache, :cache_dir

  ##########################################################
  # publish Array functions of @difference
  #
  # Element of @difference at index.
  def [](index)
    @difference[index]
  end

  # Appends item to @difference.
  def push(item)
    @difference.push(item)
  end

  # Removes and returns the last element of @difference.
  def pop
    @difference.pop
  end

  ##########################################################
  # import from diff output
  # imports [['-', "foo"], [' ', "\n"], ['+', "bar"], ..] into @difference
  #
  # Converts every [mark, text] pair into {'attribute'=>mark, 'value'=>text},
  # stores the new list in @difference and returns it.
  def import_diff(diff)
    imported = []
    diff.each do |pair|
      imported << {'attribute' => pair[0], 'value' => pair[1]}
    end
    @difference = imported
  end

  ##########################################################
  # get the attribute (' ', '-' or '+') of every element, as an Array
  #
  # One entry per element of @difference: that element's attribute.
  def attributes
    @difference.map do |element|
      element.attribute
    end
  end

  ##########################################################
  #
  #
  # One entry per element of @difference: that element's value.
  def values
    @difference.map do |element|
      element.value
    end
  end

  ##########################################################
  #
  #
  # Compares text1 with text2 at the requested granularity ("word", "char"
  # or "line"), stores the result in @difference and returns it.
  # separator is the line separator used for "line" granularity;
  # morphoanalysis is handed to compare_by_word.
  # When the cache setting is 'on', a result cached for the same arguments
  # is returned without running the comparison, and new results are cached.
  # Raises RuntimeError for any other granularity.
  def compare(text1, text2, separator = "\n", granularity = "word", morphoanalysis = false)

    # create cache identification string from method name and arguments.
    # inspect keeps the argument boundaries, so ("ab", "c") and ("a", "bc")
    # do not end up sharing one cache entry.
    cache_id = [
      'compare', text1, text2, separator, granularity, morphoanalysis
    ].inspect

    # read cache
    if (cache == 'on') && Cache.cache_exist?(cache_id, cache_dir)
      cache_data = Cache.read_cache(cache_id, cache_dir)
      @difference = cache_data
      return cache_data
    end

    # call comparison sub-method, depending on granularity.
    result =
      case granularity
      when "word"
        compare_by_word(text1, text2, morphoanalysis)
      when "char"
        compare_by_char(text1, text2)
      when "line"
        # pass the caller's separator on (it used to be replaced by "\n").
        compare_by_line(text1, text2, separator)
      else
        raise "wrong type of granularity: #{granularity}"
      end
    @difference = result

    # write cache
    if cache == 'on'
      Cache.prepare_cachedir(cache_dir)
      Cache.write_cache(result, cache_id, cache_dir)
    end

    return result

  end

  ##########################################################
  #
  #
  # Two-pass comparison: diff the texts line by line first, then re-compare
  # only the changed stretches by word or by character.  The result is
  # stored in @difference and returned.
  # NOTE(review): relies on a Difference class (new, load_diff, scan_changed,
  # [] with a Range, +) that is not defined in the visible part of this
  # file -- confirm it exists before using this method.
  # NOTE(review): a changed stretch is re-compared only for granularity
  # "word" or "char"; for any other value the inner case has no branch, so
  # nothing is appended for that stretch.
  def compare_2step(text1, text2, separator = "\n", granularity = "word", morphoanalysis = false)

    # screening unchanged part by comparing by line first, then by word/char.
    diff_by_line = Difference.new
    d = Diff.new(@diff_cmd)
    diff_by_line.load_diff(
      d.compare(
        split_by_line(text1, separator),
        split_by_line(text2, separator)
      )
    )
    # position: list of [mark, index list] entries, see the example below.
    position = diff_by_line.scan_changed
    replaced = Difference.new
    position.each{|pos|  # pos: [" ", [0, 0]] or ["!", [0, 0, 1, 1]]
      case pos[0]
      when " "
        # stretch marked " ": copy elements first..last over as they are.
        replaced = replaced + diff_by_line[pos[1][0]..pos[1][1]]
        #p replaced
      when "!"
        # stretch marked "!": indexes 0..1 delimit the first range and
        # indexes 2..3 the second; the second item (e[1]) of every element
        # in each range is collected and the two collections are compared.
        case granularity
        when "word"
          replaced = replaced +
            compare_by_word(
              diff_by_line[pos[1][0]..pos[1][1]].collect{|e|e[1]}.to_s,
              diff_by_line[pos[1][2]..pos[1][3]].collect{|e|e[1]}.to_s,
              morphoanalysis
            )
            #p replaced
        when "char"
          replaced = replaced +
            compare_by_char(
              diff_by_line[pos[1][0]..pos[1][1]].collect{|e|e[1]}.to_s,
              diff_by_line[pos[1][2]..pos[1][3]].collect{|e|e[1]}.to_s
            )
        end  # end case granularity
      else
        raise "mark other than ' ' or '!' found in scan_changed() retval."
      end  # end case pos[0]
    }
    #exit
    @difference = replaced
    return replaced
  end

  ##########################################################
  # add diff_option and diff_cmdpath later
  #
  # Splits both texts into lines (split_by_line), diffs the two lists with
  # a Diff built from @diff_cmd and imports the outcome via import_diff.
  def compare_by_line(text1, text2, separator ="\n")
    differ = Diff.new(@diff_cmd)
    lines1 = split_by_line(text1, separator)
    lines2 = split_by_line(text2, separator)
    import_diff(differ.compare(lines1, lines2))
  end

  ##########################################################
  #
  #
  # Splits both texts into words, diffs the two lists with a Diff built
  # from @diff_cmd and imports the outcome via import_diff.
  # Words come from split_by_morpheme when morphoanalysis is exactly true,
  # from split_by_word_approximate otherwise.
  def compare_by_word(text1, text2, morphoanalysis = false)
    differ = Diff.new(@diff_cmd)
    # add handling for diff_cmdpath, diff_option, chasen_cmdpath, chasen_option, later.
    splitter = (morphoanalysis == true) ? :split_by_morpheme : :split_by_word_approximate
    words1 = send(splitter, text1, guess_codeset(text1))
    words2 = send(splitter, text2, guess_codeset(text2))
    import_diff(differ.compare(words1, words2))
  end

  ##########################################################
  #
  #
  # Splits both texts into characters (split_by_char with the guessed
  # codeset), diffs the two lists with a Diff built from @diff_cmd and
  # imports the outcome via import_diff.
  def compare_by_char(text1, text2)
    # add handling for diff_cmdpath, diff_option, later.
    differ = Diff.new(@diff_cmd)
    chars1 = split_by_char(text1, guess_codeset(text1))
    chars2 = split_by_char(text2, guess_codeset(text2))
    import_diff(differ.compare(chars1, chars2))
  end

  ##########################################################
  # Split string to lines
  # "Foo bar.\n\nBaz quux.\n" => ["Foo bar.","\n","\n","Baz quux.","\n"]
  #
  # Splits text (a String, or an Array of Strings) into line contents and
  # separators: every separator becomes an element of its own and empty
  # pieces are dropped.  A nil separator means "\n"; a nil text gives [].
  # The separator is matched literally (it is regexp-escaped), and the
  # global record separator ($/) is left untouched.
  def split_by_line(text, separator)
    eol = separator.nil? ? "\n" : separator
    # the capture group makes String#split keep the separators.
    eol_pattern = Regexp.new("(" + Regexp.escape(eol) + ")")

    items =
      if text.nil?
        []
      elsif text.is_a?(Array)
        text
      else
        [text]
      end

    elements = []
    items.each do |item|
      item.split(eol_pattern).each do |piece|
        elements.push(piece) unless piece.empty?
      end
    end
    return elements
  end

  ##########################################################
  # Split string to words
  #
  # Dispatches to split_by_morpheme when morphoanalysis is exactly true,
  # to split_by_word_approximate in every other case.
  def split_by_word(string, codeset = 'NONE', morphoanalysis = false)
    return split_by_word_approximate(string, codeset) unless morphoanalysis == true
    split_by_morpheme(string, codeset)
  end

  ##########################################################
  # Split string to words
  # (Japanese language does not have 'word', so this is approximate for
  #  Japanese.)
  # Note: Keep consistency with split_by_morpheme.
  #
  # Cuts text (a String, or an Array of Strings) into word-like tokens:
  # runs of ASCII alphanumerics, single ASCII symbols and control
  # characters, and runs of EUC-JP kana/kanji/alphanumerics.  A " " token
  # is then glued to the front of the token that follows it.
  # codeset is assigned to $KCODE while the method runs.
  #
  # Patterns are anchored with \A (beginning of the remaining text) and are
  # tried in a fixed order; when none matches, one byte is taken as a token.
  def split_by_word_approximate(text, codeset = 'NONE')

    save = $KCODE
    $KCODE = codeset

    begin
      # ASCII alphabet and number
      ub_alnum = '(?:[0-9A-Za-z_\-])'  # good-bye => good-bye
      # Symbols(excluding -) in ASCII text:
      # 0x20-0x2f( !"#$%&'()*+,-./), 0x3a-0x40(:;<=>?@),
      # 0x5b-0x5e([\]^), 0x60(`), 0x7b-0x7e({|}~)
      ub_symbol = '(?:(?:[\x20-\x2c])|(?:[\x2e-\x2f])|(?:[\x3a-\x40])|(?:[\x5b-\x5e])|(?:\x60)|(?:[\x7b-\x7e]))' # excluding '-'(0x2d)
      # ASCII control characters
      ub_control = '(?:[\x00-\x1f])'
      # EUC-ja unibyte katakana: 0x8e21-0x8e5f
      ub_kata = '(?:\x8e[\x21-\x5f])'
      # EUC-ja symbol (excluding macron("onbiki") and repeat("noma")):
      # 0xa1a1-0xa1b8, 0xa1ba-0xa1bb, 0xa1bd-0xa1fe, 0xa2a1-0xa2fe
      # macron is included in katakana, Noma in kanji.
      # (a further note at this place was lost to an encoding error.)
      mb_symbol = '(?:(?:\xa1[\xa1-\xb8\xba-\xbb\xbd-\xfe])|(?:\xa2[\xa1-\xfe]))'
      # EUC-ja multibyte alphabet and number: 0xa3b0-0xa3ff
      mb_alnum = '(?:\xa3[\xb0-\xff])'
      # EUC-ja 2-byte hiragana: 0xa4a1-0xa4fe
      mb_hira = '(?:\xa4[\xa1-\xfe])'
      # EUC-ja 2-byte katakana: 0xa5a1-0xa5fe, 0xa1bc(=macron)
      mb_kata = '(?:(?:\xa5[\xa1-\xfe])|(?:\xa1\xbc))'
      # EUC-ja Greek: 0xa6a1-0xa6fe
      mb_greek = '(?:\xa6[\xa1-\xfe])'
      # EUC-ja Cyrillic: 0xa7a1-0xa7fe
      mb_cyrillic = '(?:\xa7[\xa1-\xfe])'
      # EUC-ja box drawing symbol (=keisen): 0xa8a1-0xa8fe
      mb_boxdraw = '(?:\xa8[\xa1-\xfe])'
      # EUC-ja undefined: 0xa9a1-0xacfe, 0xaea1-0xaffe
      mb_undefined = '(?:(?:[\xa9-\xac][\xa1-\xfe])|(?:[\xae-\xaf][\xa1-\xfe]))'
      # EUC-ja NEC-only symbol: 0xada1-0xadfe
      mb_symbol_nec = '(?:\xad[\xa1-\xfe])'
      # EUC-ja kanji: 0xb0a1-0xfefe, 0xa1b9(="Kuma", 2nd symbol in "hitobito")
      # (actually includes undefined/NEC-kanji area)
      mb_kanji = '(?:(?:[\xb0-\xfe][\xa1-\xfe])|(?:\xa1\xb9))'

      # The patterns describe raw bytes.  Where strings carry an encoding,
      # both patterns and text are handled as binary while matching.
      to_bytes = lambda {|str|
        if str.respond_to?(:force_encoding)
          str.dup.force_encoding('BINARY')
        else
          str
        end
      }
      # \A anchors at the very start of the remaining text (unlike ^, which
      # also matches after an interior newline); the group makes the anchor
      # cover every alternative.
      anchored = lambda {|body|
        Regexp.new(to_bytes.call('\A(?:' + body + ')'))
      }

      # order matters: the first pattern that matches wins.
      patterns = [
        anchored.call("#{ub_alnum}+"),
        anchored.call(ub_symbol),
        anchored.call(ub_control),
        anchored.call("#{ub_kata}+"),
        anchored.call("#{mb_kanji}+#{mb_hira}+"),  # experimental
        anchored.call("#{mb_kata}+#{mb_hira}+"),   # experimental
        anchored.call("#{mb_alnum}+"),
        anchored.call("#{mb_hira}+"),
        anchored.call("#{mb_kata}+"),
        anchored.call("#{mb_kanji}+"),
        anchored.call("#{mb_symbol}|#{mb_greek}|#{mb_cyrillic}|#{mb_boxdraw}|#{mb_symbol_nec}|#{mb_undefined}")
      ]

      tokens = []
      Array(text).each do |string|
        rest = to_bytes.call(string)
        while rest.length > 0
          hit = nil
          patterns.each do |pattern|
            hit = pattern.match(rest)
            break if hit
          end
          if hit
            piece = hit[0]
            rest = hit.post_match
          else
            # nothing matched: take a single byte.
            piece = rest[0..0]
            rest = rest[1..-1]
          end
          # hand tokens back in the encoding the input came in.
          if piece.respond_to?(:force_encoding)
            piece.force_encoding(string.encoding)
          end
          tokens.push(piece)
        end # end while
      end # end do

      # " ", "foo" => " foo"
      # Only the element directly before a token is looked at, so the first
      # token can no longer pick up a " " from the end of the list.
      result = []
      tokens.each do |item|
        if (result.last == " ") && (item != " ")
          result[-1] = result.last + item
        else
          result.push(item)
        end
      end

      return result
    ensure
      $KCODE = save
    end
  end

  ##########################################################
  # Split string to morpheme
  #
  # Runs chasen on string with $KCODE set to codeset and returns its
  # result.  $KCODE is restored even when chasen raises.
  def split_by_morpheme(string, codeset = 'NONE')
    if codeset.upcase != 'NONE'
      begin
        require 'jcode'
      rescue LoadError
        # jcode is only shipped with Ruby 1.8 and earlier; carry on
        # without it where it does not exist.
      end
    end
    save = $KCODE
    $KCODE = codeset

    begin
      return chasen(string)
    ensure
      $KCODE = save
    end
  end

  ##########################################################
  # Split string to characters
  #
  # Splits string into characters with $KCODE set to codeset.
  # $KCODE is restored even when the split raises.
  def split_by_char(string, codeset = 'NONE')
    if codeset.upcase != 'NONE'
      begin
        require 'jcode'
      rescue LoadError
        # jcode is only shipped with Ruby 1.8 and earlier; carry on
        # without it where it does not exist.
      end
    end
    save = $KCODE
    $KCODE = codeset
    begin
      return string.split(//)
    ensure
      $KCODE = save
    end
  end

  ##########################################################
  # Guess codeset of given string
  #
  # Maps the result of NKF::guess(string) to a codeset name: "JIS", "EUC",
  # "SJIS", "BINARY" or "UNKNOWN(ASCII)".  Any result without an entry in
  # the table is reported as "UNKNOWN(ASCII)" rather than nil, because
  # split_by_morpheme and split_by_char call upcase on the name.
  def guess_codeset(string)
    #require 'nkf'
    codes = {
      NKF::JIS     => "JIS",
      NKF::EUC     => "EUC",
      NKF::SJIS    => "SJIS",
      NKF::BINARY  => "BINARY",
      NKF::UNKNOWN => "UNKNOWN(ASCII)"
    }
    codes.fetch(NKF::guess(string), "UNKNOWN(ASCII)")
  end

  ##########################################################
  # End-of-line character detector
  # Returns 'CR', 'LF', 'CRLF', 'NONE', 'UNKNOWN', or nil.
  #
  # Names the end-of-line flavor found in the first sample_length
  # characters of string (0 means the whole string):
  # 'CR', 'LF' or 'CRLF' when exactly one flavor occurs, 'NONE' when there
  # is no end-of-line character, 'UNKNOWN' when flavors are mixed, and nil
  # when string is nil.
  def guess_eol(string, sample_length = 4096)

    return nil if string.nil?  # nothing to examine

    sample = string[0 .. (sample_length - 1)]  # Read less to gain speed.
    # Do not cut a CRLF pair in half: a sample ending in "\r" whose next
    # character is "\n" would otherwise be counted as a lone CR.
    if (sample.length < string.length) &&
       (sample[-1, 1] == "\r") && (string[sample.length, 1] == "\n")
      sample = string[0 .. sample.length]
    end

    # Every CRLF pair holds one "\r" and one "\n"; whatever exceeds the
    # CRLF count is a lone CR / lone LF.  This also sees a "\n" at the very
    # beginning of the sample or directly after another "\n".
    crlf_count = sample.scan(/\r\n/).size
    found = []
    found.push('CR')   if sample.count("\r") > crlf_count
    found.push('LF')   if sample.count("\n") > crlf_count
    found.push('CRLF') if crlf_count > 0

    case found.size  # How many flavors found?
    when 1           # Only one type of EOL was found.
      return found[0]
    when 0           # No EOL found.  Might be 1-line file.
      return 'NONE'
    else             # Multiple types of EOL found. (Maybe binary data.)
      return 'UNKNOWN'
    end

  end

  ##########################################################
  # Collect succeeding elements with same attribute, and put them together.
  # [[" ", "foo"], [" ", "bar"]] => [[" ", "foobar"],..]
  # Note: "\n" is always treated as an independent element.
  #
  # Returns a new list in which an element is appended to the group before
  # it when it has the same attribute as the preceding element of
  # @difference and its value is not "\n".  @difference is not modified.
  # An empty @difference gives an empty list.
  def group_simple
    grouped = []
    previous = nil
    @difference.each do |element|
      if previous &&
         (element.attribute == previous.attribute) &&
         (element.value != "\n")
        last_element = grouped.pop
        grouped.push(
          {
            'attribute'=>last_element.attribute,
            'value'=>(last_element.value + element.value)
          }
        )
      else
        # [..,[" ", "foobar"]] << ["+", "baz"]
        grouped.push(element)
      end
      previous = element
    end
    return grouped
  end

  ##########################################################
  # Format difference data using HTML
  #
  # Renders the grouped difference (group_for_html) as an HTML document.
  # specified_tags: entries that override the default tags below.
  # title:          inserted between the title tags as given (not escaped).
  # codeset:        'EUC', 'JIS', 'SJIS' or 'ASCII' adds the matching
  #                 charset to the meta tag; anything else adds none.
  # Values are HTML-escaped and "\n" is replaced by the end_of_line tag;
  # '-' elements are wrapped in the remove tags, '+' elements in the add
  # tags, everything else is emitted bare.
  def format_html(specified_tags={}, title='', codeset='')

    # default HTML tags
    tags = {
      'doc_type'     => '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN">',
      'doc_begin'    => '<html>',
      'doc_end'      => '</html>',
      'head_begin'   => '<head>',
      'head_end'     => '</head>',
      'meta'         => '<meta http-equiv="Content-Type" content="text/html">',
      'title_begin'  => '<title>',
      'title_end'    => '</title>',
      'body_begin'   => '<body>',
      'body_end'     => '</body>',
      'remove_begin' => '<del>',
      'remove_end'   => '</del>',
      'add_begin'    => '<ins>',
      'add_end'      => '</ins>',
      'end_of_line'  => '<br>'
    }

    # charset announced for each known codeset.
    charsets = {
      'EUC'   => 'EUC-JP',
      'JIS'   => 'iso-2022-jp',
      'SJIS'  => 'Shift_JIS',
      'ASCII' => 'us-ascii'
    }
    if charsets.key?(codeset)
      tags['meta'] = '<meta http-equiv="Content-Type" content="text/html; charset=' + charsets[codeset] + '">'
    end

    # merge user-specified tags into pre-defined default tags.
    tags.update(specified_tags)

    header = (
      "#{tags['doc_type']}\n#{tags['doc_begin']}\n" +
      " #{tags['head_begin']}\n" +
      "  #{tags['meta']}\n" +
      "  #{tags['title_begin']}#{title}#{tags['title_end']}\n" +
      " #{tags['head_end']}\n" +
      " #{tags['body_begin']}\n"
    )
    footer =(
      "\n" +
      " #{tags['body_end']}\n" +
      "#{tags['doc_end']}\n"
    )

    pieces = group_for_html.collect do |elem|
      text = escape_html(elem.value).gsub(/\n/, tags['end_of_line'])
      case elem.attribute
      when '-'
        tags['remove_begin'] + text + tags['remove_end']
      when '+'
        tags['add_begin'] + text + tags['add_end']
      else
        text
      end
    end
    # join explicitly: Array#to_s does not concatenate the elements on
    # every Ruby version.
    body = pieces.join

    return header + body + footer

  end

  ##########################################################
  # Group "difference" for HTML output
  #
  # Grouping used by format_html; currently the same as group_simple.
  def group_for_html
    return group_simple()
  end

  ##########################################################
  # examine "difference" and return position info about changed part (-+)
  #
  # Walks the attribute marks of the difference and reports where runs of
  # '-' are directly followed by runs of '+'.
  # Returns a list of [mark, indexes] entries:
  #   [" ", [first, last]]                       stretch outside such a pair
  #   ["!", [del_first, del_last, add_first, add_last]]   "-..-+..+" pair
  # Indexes are positions in the joined attribute string, so every
  # attribute is expected to be one character long.
  def scan_changed

    # join explicitly: Array#to_s does not concatenate the elements on
    # every Ruby version.
    attr_str = attributes.join  #=> " ", "-", "+", ...  #=>" -+..."
    cursor = 0
    result = []

    while attr_str.length > cursor
      found = /\-+\++/.match(attr_str[cursor..-1])

      unless found  # no more "-+"s, so record the rest
        result.push([" ", [cursor, (attr_str.length - 1)]])
        #=> [" ", [14, 15]]
        break
      end

      # record the stuff between the last and current match
      skipped = found.pre_match.length
      if skipped > 0  # consider '!' at beginning
        result.push([" ", [cursor, (cursor + skipped - 1)]])
        #=> [" ", [0, 5]]
        # move the cursor to the beginning of the match
        cursor = cursor + skipped
      end

      minus_length = found[0].count('-')  #=> "--".length
      plus_length  = found[0].count('+')  #=> "+".length

      # record the current matched
      result.push(
        ["!",
         [cursor,
          (cursor + minus_length - 1),
          (cursor + minus_length),
          (cursor + minus_length + plus_length - 1)
         ]
        ]
      )  #=> ["!", [6, 7, 8, 8]]
      # move the cursor to the point just after the match
      cursor = cursor + found[0].length
    end

    return result
    # result will look like (for " -  + --+ -+++ -"):
    # [[" ", [0, 5]],
    #  ["!", [6, 7, 8, 8]],
    #  [" ", [9, 9]],
    #  ["!", [10, 10, 11, 13]],
    #  [" ", [14, 15]]]
  end

  ##########################################################
  # Escape HTML special characters ('<' => '&lt;')
  #
  def escape_html(string)
    # Delegates to the standard CGI library, which is expected to have been
    # required elsewhere in this file.
    escaped = CGI.escapeHTML(string)
    escaped
  end

  ##########################################################
  # Unescape HTML special characters ('&lt;' => '<')
  #
  def unescape_html(string)
    # Delegates to the standard CGI library, which is expected to have been
    # required elsewhere in this file.
    unescaped = CGI.unescapeHTML(string)
    unescaped
  end

  ##########################################################
  # Format difference data using escape sequence
  # title is just ignored in format_escape().
  #
  def format_escape(specified_tags={}, title='no title', codeset='')
    # Renders the grouped difference as one String, wrapping removed ('-')
    # and added ('+') parts in terminal escape sequences.
    #   specified_tags: Hash overriding any of the default tags below
    #                   (nil is treated as "no overrides")
    #   title, codeset: accepted for signature compatibility, not used here
    # Each element yielded by group_for_escape must respond to #attribute and
    # #value (see the Hash extensions at the top of this file).

    # default escape sequence, or 'tag'
    tags = {
      'remove_begin' => "\033[7m",   # xor on
      'remove_end'   => "\033[27m",  # xor off
      'add_begin'    => "\033[4m",   # underline on
      'add_end'      => "\033[24m",  # underline off
      'end_of_line'  => "\n"
    }

    # merge user-specified tags into pre-defined default tags.
    tags.update(specified_tags || {})

    body = group_for_escape.collect do |elem|
      text = elem.value.gsub(/\n/, tags['end_of_line'])
      case elem.attribute
      when '-'
        tags['remove_begin'] + text + tags['remove_end']
      when '+'
        tags['add_begin'] + text + tags['add_end']
      else
        # ' ' (unchanged) and any unknown attribute are emitted undecorated
        text
      end
    end

    # Join explicitly: Array#to_s concatenates only on Ruby 1.8 and returns
    # the inspect form (brackets, quotes, commas) on later versions.
    return body.join
  end

  ##########################################################
  # Group "difference" for escape sequence output
  #
  def group_for_escape
    # Escape sequence output uses the simple grouping unchanged.
    return group_simple()
  end

  ##########################################################
  # Dump "difference" as a Ruby object (inspect) string, for debugging
  #
  def format_debug
    # Human-readable dump of the raw difference data held in @difference.
    dump = @difference.inspect
    return dump
  end

=begin not implemented yet
  ##########################################################
  #
  #
  def format_manued
    
  end

  ##########################################################
  #
  #
  def group_for_manued
    group_complex
  end

  ##########################################################
  #
  #
  def group_complex
    
  end

  ##########################################################
  #
  #
  def format_xhtml
    #
  end

  ##########################################################
  #
  #
  def group_for_xhtml
    group_simple
  end

=end not implemented yet

  ##########################################################
  # below are the methods to be used by application.
  ##########################################################

  ##########################################################
  # version, copyright
  #
  def version
    # First line: application name and version number; second line: copyright.
    # No trailing newline is appended.
    format("%s %s\n%s", APPLICATION_NAME, DOCDIFF_VERSION, DOCDIFF_COPYRIGHT)
  end

  ##########################################################
  # usage information
  #
  def usage
    return <<-EndOfMessage.gsub(/^    /,'')
    Usage: #{File.basename($0)} [OPTION..] FILE1 FILE2
    OPTION:
      -g  --granularity UNIT     Set comparison granularity.
                                   UNIT:   word    word (default)
                                           char    character
                                           line    line
      -m  --morphoanalysis MODE  Set morphoanalysis mode.
                                   MODE:   auto    on if available (default)
                                           on      force on
                                           off     force off
      -f  --format FORMAT        Specify output format.
                                   FORMAT: html    HTML (default)
                                           xhtml   XHTML
                                           esc     escape sequence
                                           manued  Manued
                                           debug   Ruby object dump
      --tag TAGS                 Set mark-up tags.
      --diff PATH_TO_DIFF        Specify diff command.
      --chasen PATH_TO_CHASEN    Specify chasen command.
      --rc RC_FILE               Specify DocDiff configuration file.
      --chasenrc CHASEN_RC       Specify ChaSen configuration file.
      --cache MODE               Enable/disable cache.
                                   MODE:   on      enable cache (default)
                                           off     disable cache
      --cachedir CACHE_DIR       Specify cache directory.
      -h  --help                 Output help message.
      -v  --version              Output version information.
    EndOfMessage
  end

end  # end of class DocDiff

############################################################
# DocDiff Application part
############################################################
if $0 == __FILE__

# GC.disable  # not much performance gain (3.7sec->3.4sec). hmm.

  docdiff = DocDiff.new

  # invoked without any argument: show usage and quit
  if ARGV.empty?
    $stderr.print(docdiff.usage)
    exit
  end

  ##########################################################
  # process command line option
  #
  require 'getoptlong'
  getoptlong = GetoptLong.new(
    ['--granularity',    '-g', GetoptLong::REQUIRED_ARGUMENT],
    ['--morphoanalysis', '-m', GetoptLong::REQUIRED_ARGUMENT],
    ['--format',         '-f', GetoptLong::REQUIRED_ARGUMENT],
    ['--tag',                  GetoptLong::REQUIRED_ARGUMENT],
    ['--diff',                 GetoptLong::REQUIRED_ARGUMENT],
    ['--chasen',               GetoptLong::REQUIRED_ARGUMENT],
    ['--rc',                   GetoptLong::REQUIRED_ARGUMENT],
    ['--chasenrc',             GetoptLong::REQUIRED_ARGUMENT],
    ['--cache',                GetoptLong::REQUIRED_ARGUMENT],
    ['--cachedir',             GetoptLong::REQUIRED_ARGUMENT],
    ['--help',           '-h', GetoptLong::NO_ARGUMENT],
    ['--version',        '-v', GetoptLong::NO_ARGUMENT]
  )
  conf_cmdline = Hash.new
  begin
    getoptlong.each do |optname, optarg|
      case optname
      when '--granularity';     conf_cmdline['granularity']    = optarg
      when '--morphoanalysis';  conf_cmdline['morphoanalysis'] = optarg
      when '--format';          conf_cmdline['output_format']  = optarg
      when '--tag';             conf_cmdline['tag']            = optarg
      when '--diff';            conf_cmdline['diff_cmd']       = optarg
      when '--chasen';          conf_cmdline['chasen_cmd']     = optarg
      when '--rc';              conf_cmdline['rc_file']        = optarg
      when '--chasenrc';        conf_cmdline['chasen_rc']      = optarg
      when '--cache';           conf_cmdline['cache']          = optarg
      when '--cachedir';        conf_cmdline['cache_dir']      = optarg
      when '--help'
        # $stderr.print(docdiff.usage)  # you want "docdiff.rb -h | more", huh?
        print(docdiff.usage)
        exit
      when '--version'
        # $stderr.print("#{docdiff.version}\n")
        print("#{docdiff.version}\n")
        exit
      end
    end
  rescue
    # option parsing failed (e.g. unknown option): show usage and fail.
    # 'exit' above is not caught here because SystemExit is not a StandardError.
    $stderr.print(docdiff.usage)
    exit(1)
  end

  # Option parsing leaves only the non-option arguments in ARGV.  Two
  # documents are needed for a comparison; without them the code below would
  # end up comparing against nil.
  if ARGV.size < 2
    $stderr.print(docdiff.usage)
    exit(1)
  end

  ##########################################################
  # read configuration from rc file
  #
  conf_rc = Hash.new

  module Conf
    # guarantee that constant 'Conf' exists even when no rc file is loaded.
  end

  # File tests do not expand '~', so the home directory has to be resolved
  # explicitly; otherwise the default rc file is never found.
  default_rc_file = begin
    File.expand_path('~/.docdiffrc')
  rescue ArgumentError
    nil  # home directory could not be determined
  end

  if conf_cmdline['rc_file'] != nil  # if rcfile is specified on command line,
    if FileTest.exist?(conf_cmdline['rc_file'])
      load conf_cmdline['rc_file']   # load specified config file.
    else
      raise "Failed to read .docdiffrc file: #{conf_cmdline['rc_file']}\n"
    end
  elsif default_rc_file && FileTest.exist?(default_rc_file)
    load default_rc_file
  # else
  #   $stderr.print "No .docdiffrc found.  Default configuration is used.\n"
  end

  # pick up whichever settings the rc file defined as constants in Conf
  conf_rc['granularity']    = Conf::GRANULARITY    if Conf::const_defined?(:GRANULARITY)
  conf_rc['morphoanalysis'] = Conf::MORPHOANALYSIS if Conf::const_defined?(:MORPHOANALYSIS)
  conf_rc['output_format']  = Conf::OUTPUT_FORMAT  if Conf::const_defined?(:OUTPUT_FORMAT)
  conf_rc['tag']        = Conf::TAG        if Conf::const_defined?(:TAG)
  conf_rc['diff_cmd']   = Conf::DIFF_CMD   if Conf::const_defined?(:DIFF_CMD)
  conf_rc['chasen_cmd'] = Conf::CHASEN_CMD if Conf::const_defined?(:CHASEN_CMD)
  conf_rc['chasen_rc']  = Conf::CHASEN_RC  if Conf::const_defined?(:CHASEN_RC)
  conf_rc['cache']      = Conf::CACHE      if Conf::const_defined?(:CACHE)
  conf_rc['cache_dir']  = Conf::CACHE_DIR  if Conf::const_defined?(:CACHE_DIR)
  #conf_rc = conf_rc.delete_if {|k, v| v == nil} # obsolete debug code

  ##########################################################
  # set default configuration
  #
  conf_default = Hash.new
  conf_default['granularity'] = 'word'
  conf_default['morphoanalysis'] = 'auto'
  conf_default['output_format'] = 'html'
  conf_default['diff_cmd'] = 'diff'
  conf_default['chasen_cmd'] = 'chasen'
  conf_default['rc_file'] = '~/.docdiffrc'
  conf_default['chasen_rc'] = '~/.chasenrc.docdiff'
  conf_default['cache'] = 'on'
  conf_default['cache_dir'] = Cache.default_cachedir

  ##########################################################
  # merge command line option to config read from file (overwrite)
  #
  conf_runtime = Hash.new
  conf_runtime.update(conf_default)
  conf_runtime.update(conf_rc)      # overwrite with rc file
  conf_runtime.update(conf_cmdline) # overwrite with command line option

  # morphoanalysis auto -> on or off
  case conf_runtime['morphoanalysis']
  when 'on'
    conf_runtime['morphoanalysis'] = true
  when 'off'
    conf_runtime['morphoanalysis'] = false
  when 'auto'
    # NOTE(review): executable? tests the given path only and does not search
    # PATH, so the bare default 'chasen' enables analysis only when such a
    # file exists relative to the current directory -- confirm this is intended.
    if FileTest.executable?(conf_runtime['chasen_cmd']) && (conf_runtime['chasen_rc'] != nil)
      conf_runtime['morphoanalysis'] = true
    else
      conf_runtime['morphoanalysis'] = false
    end
  else
    raise "unsupported morphoanalysis value:#{conf_runtime['morphoanalysis'].inspect}\n"
  end

#  p conf_runtime; exit # for debug

  ##########################################################
  # apply configuration
  #
  docdiff.granularity    = conf_runtime['granularity']
  docdiff.morphoanalysis = conf_runtime['morphoanalysis']
  docdiff.output_format  = conf_runtime['output_format']
  docdiff.tag            = conf_runtime['tag']
  docdiff.diff_cmd       = conf_runtime['diff_cmd']
  docdiff.chasen_cmd     = conf_runtime['chasen_cmd']
  docdiff.chasen_rc      = conf_runtime['chasen_rc']
  docdiff.cache          = conf_runtime['cache']
  docdiff.cache_dir      = conf_runtime['cache_dir']

  ##########################################################
  # read documents, compare, then output the difference
  #

  # the block form closes each file as soon as it has been read
  doc = ARGV.collect do |path|
    File.open(path, 'rb') {|file| file.read }
  end

  docdiff.compare(doc[0], doc[1], "\n", docdiff.granularity, docdiff.morphoanalysis)

  case docdiff.output_format
  when 'html'
    print docdiff.format_html({}, "#{ARGV[0]} (#{File.stat(ARGV[0]).mtime}), #{ARGV[1]} (#{File.stat(ARGV[1]).mtime})", '')
  when 'esc'
    print docdiff.format_escape
  when 'debug'
    print docdiff.format_debug
  else
    # fail loudly instead of printing nothing for formats that are not
    # implemented or are misspelled
    raise "unsupported output format:#{docdiff.output_format.inspect}\n"
  end

end  # end if $0 == __FILE__