#!/usr/bin/ruby
############################################################
# DocDiff: Document Diff
#   Created: Sat, 09 Dec 2000 +0900
#   Updated: Fri, 10 Aug 2001 +0900
#   Hisashi MORITA
############################################################

APPLICATION_NAME  = "DocDiff"
DOCDIFF_VERSION   = '0.2.0'
DOCDIFF_COPYRIGHT = "Copyleft (C) 2000-2001 Hisashi MORITA"

############################################################
# DocDiff-specific accessors added to Hash (this pollutes the core class).
# A difference element is a Hash of the form
# {'attribute' => mark, 'value' => text}; these methods read and write
# those two keys.
############################################################
class Hash
  def attribute
    self['attribute']
  end

  def attribute=(new_attr)
    self['attribute'] = new_attr
  end

  def value
    self['value']
  end

  def value=(new_value)
    self['value'] = new_value
  end
end

############################################################
# newline (end of line) character handler
############################################################
module NewLine
  CR      = "\x0d"
  LF      = "\x0a"
  CRLF    = "\x0d\x0a"
  NONE    = "NONE"
  UNKNOWN = "UNKNOWN"

  # Guess which newline sequence a string uses.
  #
  # string        - text to examine (nil is allowed)
  # sample_length - number of leading characters examined first;
  #                 0 means "examine the whole string"
  #
  # Returns CR, LF or CRLF when exactly one kind is found, UNKNOWN when
  # several kinds are mixed, NONE when the string has no newline at all,
  # and nil when string is nil.
  def guess_newline(string, sample_length = 4096)
    return nil if string.nil?

    # Lists the newline kinds present in the first sample_len characters.
    # Lone CR / LF are counted by subtraction, so a newline at the very
    # beginning or end of the sample is not missed.
    kinds_in = lambda { |sample_len|
      sample = string[0..sample_len - 1]   # 0 => [0..-1] => whole string
      crlf = sample.scan(CRLF).size
      [[CR,   sample.count(CR) - crlf],
       [LF,   sample.count(LF) - crlf],
       [CRLF, crlf]].reject { |pair| pair[1] == 0 }.collect { |pair| pair[0] }
    }

    found = kinds_in.call(sample_length)
    # Nothing in the sample: read the whole string to make sure.
    found = kinds_in.call(0) if found.empty?

    case found.size
    when 0 then NONE      # 1-line text
    when 1 then found[0]  # CR, LF or CRLF
    else        UNKNOWN   # mixed newlines (binary data?)
    end
  end
  module_function :guess_newline
end

############################################################
# GNU diff wrapper for DocDiff
# usage:
#   d = Diff.new('/usr/bin/diff')
#   # d.set_option('some_option')  # if needed
#   result = d.compare(ary_of_str, ary_of_str)
############################################################
class Diff
  require 'tempfile'

  # diff_cmd - path or name of the diff command.
  #
  # Runs "diff_cmd --help" once to learn which newline sequence the command
  # emits (diff on cygwin may use LF or CRLF).
  # Raises RuntimeError when the help output has no recognisable newline.
  def initialize(diff_cmd)
    @diff_cmd = diff_cmd
    @diff_opt = ""
    # Block form so that the pipe is closed after reading.
    help_text = IO.popen("#{@diff_cmd} --help", "rb") { |io| io.read }
    nl = NewLine::guess_newline(help_text)
    case nl
    when NewLine::CR, NewLine::LF, NewLine::CRLF
      @diff_newline = nl
    else
      raise "#{@diff_cmd} --help returned" +
            " #{nl} (expected CR, LF or CRLF).\n"
    end
  end

  # Set extra command line options appended to every diff invocation.
  def set_option(diff_opt)
    @diff_opt = diff_opt
  end

  # Return the extra command line options.
  def get_option
    @diff_opt
  end

  # Raw diff wrapper: writes both texts to temporary files, runs diff on
  # them and returns the output as an array of lines (newline included).
  #
  # args:
  #   ["A\n"," little\n"," bird;\n","\n","\n","And\n"," a\n"," tree.\n","\n"],
  #   ["A\n"," bird;\n","\n","\n","In\n"," a\n"," shady\n"," tree.\n","\n"]
  # return:
  #   ["---*\n", "+++*\n", "@@*\n",
  #    " A\n", "- little\n", " bird;\n", " \n", " \n",
  #    "-And\n", "+In\n", " a\n", "+ shady\n", " tree.\n", " \n"]
  def diff(text1, text2, diff_opt)
    result = []
    tf1 = Tempfile.new("__FILE__")
    tf2 = Tempfile.new("__FILE__")
    begin
      # join explicitly: printing an Array only concatenates it on old Ruby.
      tf1.print(Array(text1).join)
      tf2.print(Array(text2).join)
      tf1.close   # close only, not delete
      tf2.close
      IO.popen("#{@diff_cmd} #{diff_opt} #{tf1.path} #{tf2.path}", "rb+") { |diff_process|
        # Split on diff's own newline without touching the global $/.
        result = diff_process.readlines(@diff_newline)
      }
    ensure
      tf1.close(true)   # delete, even when diff could not be run
      tf2.close(true)
    end
    return result
  end

  # Compare two arrays of text elements and return tidied diff output.
  #
  # args:   ["A", " little", " bird;\n", "\n", "And", " a", " tree.\n"]
  #         ["A", " bird;\n", "\n", "In", " a", " shady", " tree.\n"]
  # return: [[" ","A"], ["-"," little"], [" "," bird;"], [" ","\n"],
  #          [" ","\n"],
  #          ["-","And"], ["+","In"],
  #          [" "," a"], ["+"," shady"], [" "," tree."], [" ","\n"]]
  # warning: only ["foo","\n"] is accepted here, not ["foo\n"].
  #          you cannot just throw what you got from readlines().
  #
  # Raises RuntimeError when either argument is empty, or when the newline
  # kinds of the two texts differ or cannot be determined.
  def compare(text1, text2)
    if (text1.size == 0) || (text2.size == 0)
      raise "do not compare nils.\n"
    end
    nl1 = NewLine.guess_newline(text1.join)
    nl2 = NewLine.guess_newline(text2.join)
    if (nl1 != nl2) || (nl1 == NewLine::UNKNOWN) || (nl2 == NewLine::UNKNOWN)
      raise "Invalid newline:" +
            " #{nl1.inspect}, #{nl2.inspect}\n"
    end

    # Context large enough to keep every element in one hunk.
    h_lines = ((text1.size + text2.size) / 2).to_i
    raw = diff(
      text1.collect { |elem| escape_newline(elem, nl1) },
      text2.collect { |elem| escape_newline(elem, nl2) },
      "--unified=#{h_lines} --horizon-lines=#{h_lines} " + @diff_opt
    )

    # No output means diff found no difference; report every element of
    # text1 as unchanged instead of failing on the missing header.
    return text1.collect { |elem| [" ", elem] } if raw.empty?

    body = raw[3..-1] || []   # cut off header (first 3 lines)
    body.collect { |item|
      [item[0..0], unescape_newline(item[1..-1], nl1)]
    }
  end

  # Turn one text element into one line for diff.
  #   escape_newline("foo", "\r\n")  => "foo\n" on unix
  #   escape_newline("\r\n", "\r\n") => "\n" on unix
  def escape_newline(string, nl_text)
    case string
    when nl_text
      @diff_newline
    when nil
      raise "nil is not acceptable: #{string.inspect}\n"
    else
      string + @diff_newline
    end
  end

  # Reverse of escape_newline.
  #   unescape_newline("foo\n", "\r\n") => "foo" on unix
  #   unescape_newline("\n", "\r\n")    => "\r\n" on unix
  def unescape_newline(string, nl_text)
    case string
    when @diff_newline
      nl_text
    when nil
      raise "nil is not acceptable: #{string.inspect}\n"
    else
      string.chomp(@diff_newline)
    end
  end
end # end class Diff

############################################################
# ChaSen module for DocDiff
############################################################
module ChaSen
  begin
    require 'jcode'
  rescue LoadError
    # jcode does not exist on Ruby 1.9 and later; nothing here needs it.
  end
  require 'tempfile'

  # Run the chasen command on string and return the resulting pieces.
  #
  # string         - text to analyse
  # chasen_cmdpath - chasen command
  # chasen_option  - options passed to chasen
  #
  # Each output line becomes one element; a trailing ',', '.', ';' or ':'
  # is split off into its own element ('Mr.' => 'Mr', '.'), and a single
  # space element is attached to the front of the element that follows it.
  def chasen(string, chasen_cmdpath = 'chasen',
             chasen_option = "-F \"%m\\n\" -r ~/.chasenrc.docdiff")
    analyzed = []
    # basename: the script path may contain directory separators.
    tf = Tempfile.new(File.basename(__FILE__))
    begin
      tf.print(string)
      tf.close
      IO.popen("#{chasen_cmdpath} #{chasen_option} #{tf.path}", "rb+") { |cha|
        analyzed = cha.readlines
      }
    ensure
      tf.close(true)
    end

    # unless it is an empty line, chop off the end of line char.
    analyzed = analyzed.collect { |item| item != "\n" ? item.chomp : item }

    # separate out ',' '.' ';' ':' at end of word.
    analyzed = analyzed.collect { |item|
      if /[^\,]\,$/ =~ item
        [item[0..-2], ',']
      elsif /[^\.]\.$/ =~ item
        [item[0..-2], '.']
      elsif /[^\;]\;$/ =~ item
        [item[0..-2], ';']
      elsif /[^\:]\:$/ =~ item
        [item[0..-2], ':']
      else
        item
      end
    }.flatten

    # space should be put to the beginning of next word.
    # (a trailing space has no next word and is left alone.)
    i = 0
    while i < analyzed.size
      if (analyzed[i] == " ") && analyzed[i + 1]
        analyzed[i + 1] = " " + analyzed[i + 1]
        analyzed.delete_at(i)
      end
      i += 1
    end

    # if ChaSen added an unnecessary "\n" at the end, drop it.
    if (string == string.chomp) && (analyzed[-1] == "\n")
      analyzed.pop
    end
    return analyzed
  end
end # end module ChaSen

############################################################
# DocDiff cache file module
############################################################
module Cache
  begin
    require 'digest/md5'
  rescue LoadError
    require 'md5'   # older Ruby without the digest library
  end

  # provide default cache directory path (ex: /tmp/username.progname.cache)
  def default_cachedir
    "#{ENV['TMPDIR']||ENV['TMP']||ENV['TEMP']||'/tmp'}/#{ENV['USER']||'-'}.#{File.basename($0)||'-'}.cache"
  end
  module_function :default_cachedir

  # prepare cache directory, which is unique to user and program (not process).
  # Creates it with mode 0700 when missing.
  # Raises RuntimeError when the path exists but is not a writable directory.
  def prepare_cachedir(cache_dir = default_cachedir())
    if !FileTest.exist?(cache_dir)
      Dir.mkdir(cache_dir, 0700)
    elsif !FileTest.directory?(cache_dir)
      raise "Cache directory(#{cache_dir}) is not a directory (#{File.ftype(cache_dir)}).\n"
    elsif !FileTest.writable?(cache_dir)
      raise "Cache directory(#{cache_dir}) is not writable.\n"
    end
    return true
  end
  module_function :prepare_cachedir

  # delete cache directory (useless since it does not delete cache files!)
  # Returns false when cache_dir is missing or is not a directory.
  def remove_cachedir(cache_dir = default_cachedir())
    return false unless FileTest.exist?(cache_dir) && FileTest.directory?(cache_dir)
    Dir.rmdir(cache_dir)
    return true
  end
  module_function :remove_cachedir

  # Path of the cache file for digest_seed: cache_dir + "/" + MD5 hex digest.
  def cache_path(digest_seed, cache_dir = default_cachedir())
    digest = if defined?(Digest::MD5)
               Digest::MD5.hexdigest(digest_seed)
             else
               MD5.new(digest_seed).hexdigest
             end
    cache_dir.sub(/\/$/, '') + "/" + digest
  end
  module_function :cache_path

  # check if cache file already exists
  def cache_exist?(digest_seed, cache_dir = default_cachedir())
    return File.exist?(cache_path(digest_seed, cache_dir))
  end
  module_function :cache_exist?

  # write cache file and return its path.
  # as digest_seed, you have to give a string like methodname + args.inspect
  def write_cache(data_to_write, digest_seed, cache_dir = default_cachedir())
    cache_file = cache_path(digest_seed, cache_dir)
    # Block form: the file is flushed and closed before returning.
    File.open(cache_file, "wb+") { |f| Marshal.dump(data_to_write, f) }
    return cache_file
  end
  module_function :write_cache

  # read cache file and return the object stored in it.
  # NOTE(review): Marshal.load trusts the file content; the cache directory
  # must not be writable by other users.
  def read_cache(digest_seed, cache_dir = default_cachedir())
    File.open(cache_path(digest_seed, cache_dir), "rb") { |f| Marshal.load(f) }
  end
  module_function :read_cache
end # end module Cache

############################################################
# DocDiff main class
############################################################
class DocDiff
  require 'nkf'
  require 'cgi'
  include ChaSen

  # Set every setting to its built-in default.
  def initialize()
    @difference     = Array.new
    @granularity    = 'word'
    @morphoanalysis = false
    @output_format  = 'html'
    @tag            = ''   # <-implement this!
    @diff_cmd       = '/usr/bin/diff'
    @chasen_cmd     = '/usr/bin/chasen'
    @eol            = "\n"
    @rc_file        = '~/.docdiffrc'
    @chasen_rc      = '~/.chasenrc.docdiff'
    @cache          = 'on'
    # USER may be unset; fall back to '-' as Cache.default_cachedir does.
    @cache_dir      = '/tmp/' + (ENV['USER'] || '-')
  end

  attr_accessor :difference, :granularity, :morphoanalysis, :output_format,
                :tag, :diff_cmd, :chasen_cmd, :eol, :rc_file, :chasen_rc,
                :cache, :cache_dir

  # publish Array functions of @difference
  def [](index)
    @difference[index]
  end

  def push(item)
    @difference.push(item)
  end

  def pop
    @difference.pop
  end

  # import from diff output:
  # stores [['-', "foo"], [' ', "\n"], ['+', "bar"], ..] in @difference as
  # an array of {'attribute'=>..., 'value'=>...} hashes and returns it.
  def import_diff(diff)
    @difference = diff.collect { |pair|
      {'attribute' => pair[0], 'value' => pair[1]}
    }
  end

  # attributes (' ', '-', '+') of all elements, as an array
  def attributes
    @difference.collect { |element| element.attribute }
  end

  # values of all elements, as an array
  def values
    @difference.collect { |element| element.value }
  end

  # Compare two texts at the given granularity ("word", "char" or "line"),
  # store the result in @difference and return it. When cache is 'on' the
  # result is read from / written to a cache file in cache_dir.
  # Raises RuntimeError for an unknown granularity.
  def compare(text1, text2, separator = "\n", granularity = "word", morphoanalysis = false)
    # cache identification string from method name and arguments.
    # inspect keeps the arguments delimited, so ("ab", "c") and ("a", "bc")
    # do not share one cache entry.
    cache_id = ['compare', text1, text2, separator, granularity, morphoanalysis].inspect

    if (cache == 'on') && Cache.cache_exist?(cache_id, cache_dir)
      @difference = Cache.read_cache(cache_id, cache_dir)
      return @difference
    end

    # call comparison sub-method, depending on granularity.
    result = case granularity
             when "word" then compare_by_word(text1, text2, morphoanalysis)
             when "char" then compare_by_char(text1, text2)
             when "line" then compare_by_line(text1, text2, separator)
             else raise "wrong type of granularity: #{granularity}"
             end
    @difference = result

    if cache == 'on'
      Cache.prepare_cachedir(cache_dir)
      Cache.write_cache(result, cache_id, cache_dir)
    end
    return result
  end

  # Screen the unchanged part by comparing by line first, then compare the
  # changed parts by word/char.
  # NOTE(review): relies on a Difference class (new, load_diff, scan_changed,
  # [] and +) that is not defined in this file - confirm where it comes from.
  def compare_2step(text1, text2, separator = "\n", granularity = "word", morphoanalysis = false)
    diff_by_line = Difference.new
    d = Diff.new(@diff_cmd)
    diff_by_line.load_diff(
      d.compare(split_by_line(text1, separator), split_by_line(text2, separator))
    )
    position = diff_by_line.scan_changed
    replaced = Difference.new
    position.each { |pos|
      # pos: [" ", [0, 0]] or ["!", [0, 0, 1, 1]]
      case pos[0]
      when " "
        replaced = replaced + diff_by_line[pos[1][0]..pos[1][1]]
      when "!"
        case granularity
        when "word"
          replaced = replaced + compare_by_word(
            diff_by_line[pos[1][0]..pos[1][1]].collect { |e| e[1] }.join,
            diff_by_line[pos[1][2]..pos[1][3]].collect { |e| e[1] }.join,
            morphoanalysis
          )
        when "char"
          replaced = replaced + compare_by_char(
            diff_by_line[pos[1][0]..pos[1][1]].collect { |e| e[1] }.join,
            diff_by_line[pos[1][2]..pos[1][3]].collect { |e| e[1] }.join
          )
        end
      else
        raise "mark other than ' ' or '!' found in scan_changed() retval."
      end
    }
    @difference = replaced
    return replaced
  end

  # Compare line by line. (add diff_option and diff_cmdpath later)
  def compare_by_line(text1, text2, separator = "\n")
    differ = Diff.new(@diff_cmd)
    import_diff(
      differ.compare(split_by_line(text1, separator), split_by_line(text2, separator))
    )
  end

  # Compare word by word; with morphoanalysis == true the words come from
  # split_by_morpheme, otherwise from split_by_word_approximate.
  # (add handling for diff_cmdpath, diff_option, chasen_cmdpath,
  # chasen_option, later.)
  def compare_by_word(text1, text2, morphoanalysis = false)
    differ = Diff.new(@diff_cmd)
    if morphoanalysis == true
      words1 = split_by_morpheme(text1, guess_codeset(text1))
      words2 = split_by_morpheme(text2, guess_codeset(text2))
    else
      words1 = split_by_word_approximate(text1, guess_codeset(text1))
      words2 = split_by_word_approximate(text2, guess_codeset(text2))
    end
    import_diff(differ.compare(words1, words2))
  end

  # Compare character by character.
  # (add handling for diff_cmdpath, diff_option, later.)
  def compare_by_char(text1, text2)
    differ = Diff.new(@diff_cmd)
    import_diff(
      differ.compare(
        split_by_char(text1, guess_codeset(text1)),
        split_by_char(text2, guess_codeset(text2))
      )
    )
  end

  # Split string to lines, with each separator as an element of its own.
  # "Foo bar.\n\nBaz quux.\n" => ["Foo bar.","\n","\n","Baz quux.","\n"]
  # A nil separator means "\n".
  def split_by_line(text, separator)
    sep = separator.nil? ? "\n" : separator
    # Split a String explicitly instead of relying on Array(string) and the
    # global $/, which only split into lines on old Ruby.
    lines = []
    if text.respond_to?(:each_line)
      text.each_line(sep) { |line| lines << line }
    else
      lines = Array(text)
    end
    re_entailing_eol = Regexp.compile('.+' + Regexp.escape(sep))
    re_eol_itself    = Regexp.compile(Regexp.escape(sep))
    lines.collect { |item|
      if re_entailing_eol =~ item            # "foo$/" and not "$/"
        [item.sub(re_eol_itself, ''), sep]   # "foo$/" => ["foo","$/"]
      else                                   # "$/" itself, or no separator
        [item]
      end
    }.flatten
  end

  # Split string to words, by morphoanalysis or approximately.
  def split_by_word(string, codeset = 'NONE', morphoanalysis = false)
    if morphoanalysis == true
      split_by_morpheme(string, codeset)
    else
      split_by_word_approximate(string, codeset)
    end
  end

  # Split string to words.
  # (Japanese language does not have 'word', so this is approximate for
  # Japanese.) The byte patterns below describe EUC-ja text.
  # A single space is attached to the front of the word that follows it.
  # Note: Keep consistency with split_by_morpheme.
  def split_by_word_approximate(text, codeset = 'NONE')
    save = $KCODE
    begin
      $KCODE = codeset
      # ASCII alphabet and number
      ub_alnum = '(?:[0-9A-Za-z_\-])'  # good-bye => good-bye
      # Symbols(excluding -) in ASCII text:
      # 0x20-0x2f( !"#$%&'()*+,-./), 0x3a-0x40(:;<=>?@),
      # 0x5b-0x5e([\]^), 0x60(`), 0x7b-0x7e({|}~)
      ub_symbol = '(?:(?:[\x20-\x2c])|(?:[\x2e-\x2f])|(?:[\x3a-\x40])|(?:[\x5b-\x5e])|(?:\x60)|(?:[\x7b-\x7e]))' # excluding '-'(0x2d)
      # ASCII control characters
      ub_control = '(?:[\x00-\x1f])'
      # EUC-ja unibyte katakana: 0x8e21-0x8e5f
      ub_kata = '(?:\x8e[\x21-\x5f])'
      # EUC-ja symbol (excluding macron("onbiki") and repeat("noma")):
      # 0xa1a1-0xa1b8, 0xa1ba-0xa1bb, 0xa1bd-0xa1fe, 0xa2a1-0xa2fe
      # macron is included in katakana, noma in kanji.
      mb_symbol = '(?:(?:\xa1[\xa1-\xb8\xba-\xbb\xbd-\xfe])|(?:\xa2[\xa1-\xfe]))'
      # EUC-ja multibyte alphabet and number: 0xa3b0-0xa3ff
      mb_alnum = '(?:\xa3[\xb0-\xff])'
      # EUC-ja 2-byte hiragana: 0xa4a1-0xa4fe
      mb_hira = '(?:\xa4[\xa1-\xfe])'
      # EUC-ja 2-byte katakana: 0xa5a1-0xa5fe, 0xa1bc(=macron)
      mb_kata = '(?:(?:\xa5[\xa1-\xfe])|(?:\xa1\xbc))'
      # EUC-ja Greek: 0xa6a1-0xa6fe
      mb_greek = '(?:\xa6[\xa1-\xfe])'
      # EUC-ja Cyrillic: 0xa7a1-0xa7fe
      mb_cyrillic = '(?:\xa7[\xa1-\xfe])'
      # EUC-ja box drawing symbol (=keisen): 0xa8a1-0xa8fe
      mb_boxdraw = '(?:\xa8[\xa1-\xfe])'
      # EUC-ja undefined: 0xa9a1-0xacfe, 0xaea1-0xaffe
      mb_undefined = '(?:(?:[\xa9-\xac][\xa1-\xfe])|(?:[\xae-\xaf][\xa1-\xfe]))'
      # EUC-ja NEC-only symbol: 0xada1-0xadfe
      mb_symbol_nec = '(?:\xad[\xa1-\xfe])'
      # EUC-ja kanji: 0xb0a1-0xfefe, 0xa1b9(="Kuma", 2nd symbol in "hitobito")
      # (actually includes undefined/NEC-kanji area)
      mb_kanji = '(?:(?:[\xb0-\xfe][\xa1-\xfe])|(?:\xa1\xb9))'

      # Patterns are tried in this order; the first one that matches at the
      # beginning of the remaining text wins. \A is used rather than ^ so a
      # pattern can never match after an embedded newline, and the symbol
      # alternation is grouped so the anchor applies to every alternative.
      # They are built here, after $KCODE is set, not hoisted to constants.
      patterns = [
        "#{ub_alnum}+",
        ub_symbol,
        ub_control,
        "#{ub_kata}+",
        "#{mb_kanji}+#{mb_hira}+",   # experimental
        "#{mb_kata}+#{mb_hira}+",    # experimental
        "#{mb_alnum}+",
        "#{mb_hira}+",
        "#{mb_kata}+",
        "#{mb_kanji}+",
        "(?:#{mb_symbol}|#{mb_greek}|#{mb_cyrillic}|#{mb_boxdraw}|#{mb_symbol_nec}|#{mb_undefined})"
      ].collect { |body| Regexp.new("\\A" + body) }

      result = []
      Array(text).each do |string|
        while string.length > 0
          matched = nil
          patterns.each { |pattern|
            matched = pattern.match(string)
            break if matched
          }
          if matched
            result.push(matched[0])
            string = matched.post_match
          else
            # nothing known: emit one character and go on.
            result.push(string[0..0])
            string = string[1..-1]
          end
        end
      end

      # " ", "foo" => " foo"
      # Starts at 1: index 0 has no predecessor (result[-1] is the last
      # element, not the previous one).
      i = 1
      while i < result.size
        if (result[i - 1] == " ") && (result[i] != " ")
          result[i] = result[i - 1] + result[i]
          result.delete_at(i - 1)
        end
        i += 1
      end
      return result
    ensure
      $KCODE = save
    end
  end

  # Split string to morphemes using chasen.
  def split_by_morpheme(string, codeset = 'NONE')
    load_jcode if codeset.to_s.upcase != 'NONE'
    save = $KCODE
    begin
      $KCODE = codeset
      return chasen(string)
    ensure
      $KCODE = save
    end
  end

  # Split string to characters
  def split_by_char(string, codeset = 'NONE')
    load_jcode if codeset.to_s.upcase != 'NONE'
    save = $KCODE
    begin
      $KCODE = codeset
      return string.split(//)
    ensure
      $KCODE = save
    end
  end

  # Guess codeset of given string with NKF.
  # Returns "JIS", "EUC", "SJIS", "BINARY" or "UNKNOWN(ASCII)"; any guess
  # that is not one of the known NKF constants is reported as
  # "UNKNOWN(ASCII)" instead of nil.
  def guess_codeset(string)
    codes = {
      NKF::JIS     => "JIS",
      NKF::EUC     => "EUC",
      NKF::SJIS    => "SJIS",
      NKF::BINARY  => "BINARY",
      NKF::UNKNOWN => "UNKNOWN(ASCII)"
    }
    codes[NKF::guess(string)] || codes[NKF::UNKNOWN]
  end

  # End-of-line character detector.
  # Looks at the first sample_length characters only.
  # Returns 'CR', 'LF', 'CRLF', 'NONE', 'UNKNOWN', or nil (string is nil).
  def guess_eol(string, sample_length = 4096)
    return nil if string.nil?
    sample = string[0..(sample_length - 1)]   # Read less to gain speed.
    crlf = sample.scan("\r\n").size
    # Lone CR / LF counted by subtraction so that a newline at the very
    # beginning of the sample is not missed.
    found = [['CR',   sample.count("\r") - crlf],
             ['LF',   sample.count("\n") - crlf],
             ['CRLF', crlf]].reject { |pair| pair[1] == 0 }.collect { |pair| pair[0] }
    case found.size
    when 1 then found[0]    # only one type of EOL was found
    when 0 then 'NONE'      # no EOL found; might be 1-line file
    else        'UNKNOWN'   # multiple types of EOL (maybe binary data)
    end
  end

  # Collect succeeding elements with same attribute, and put them together.
  # [[" ", "foo"], [" ", "bar"]] => [[" ", "foobar"],..]
  # An element whose value is "\n" is never appended to its predecessor.
  # Returns [] when there is no difference data.
  def group_simple
    return [] if @difference.empty?
    grouped = [@difference[0]]
    (1..(@difference.size - 1)).each do |i|
      current = @difference[i]
      if (current.attribute == @difference[i - 1].attribute) && (current.value != "\n")
        last_element = grouped.pop
        grouped.push(
          {'attribute' => last_element.attribute,
           'value'     => (last_element.value + current.value)}
        )
      else
        # [..,[" ", "foobar"]] << ["+", "baz"]
        grouped.push(current)
      end
    end
    return grouped
  end

  # Format difference data using HTML.
  #
  # specified_tags - Hash merged over the default tags
  # title          - text placed between title_begin and title_end
  # codeset        - 'EUC', 'JIS', 'SJIS', 'ASCII' or other; selects 'meta'
  #
  # NOTE(review): every default tag and every 'meta' value below is an empty
  # string in this copy of the file; presumably they held HTML mark-up that
  # has been lost - restore from the upstream source.
  def format_html(specified_tags={}, title='', codeset='')
    # default HTML tags
    tags = {
      'doc_type'     => '',
      'doc_begin'    => '',
      'doc_end'      => '',
      'head_begin'   => '',
      'head_end'     => '',
      'meta'         => '',
      'title_begin'  => '',
      'title_end'    => '',
      'body_begin'   => '',
      'body_end'     => '',
      'remove_begin' => '',
      'remove_end'   => '',
      'add_begin'    => '',
      'add_end'      => '',
      'end_of_line'  => "\n"
    }
    case codeset
    when 'EUC'   then tags['meta'] = ''
    when 'JIS'   then tags['meta'] = ''
    when 'SJIS'  then tags['meta'] = ''
    when 'ASCII' then tags['meta'] = ''
    else              tags['meta'] = ''
    end
    # merge user-specified tags into pre-defined default tags.
    tags.update(specified_tags)

    header = (
      "#{tags['doc_type']}\n#{tags['doc_begin']}\n" +
      " #{tags['head_begin']}\n" +
      " #{tags['meta']}\n" +
      " #{tags['title_begin']}#{title}#{tags['title_end']}\n" +
      " #{tags['head_end']}\n" +
      " #{tags['body_begin']}\n"
    )
    footer = (
      "\n" +
      " #{tags['body_end']}\n" +
      "#{tags['doc_end']}\n"
    )
    body = group_for_html.collect { |elem|
      text = escape_html(elem.value).gsub(/\n/, tags['end_of_line'])
      case elem.attribute
      when '-' then tags['remove_begin'] + text + tags['remove_end']
      when '+' then tags['add_begin'] + text + tags['add_end']
      else          text
      end
    }
    # join, not to_s: Array#to_s concatenates only on old Ruby.
    return header + body.join + footer
  end

  # Group "difference" for HTML output
  def group_for_html
    group_simple
  end

  # examine "difference" and return position info about changed part (-+).
  # A run of '-' directly followed by a run of '+' is reported as
  # ["!", [minus_first, minus_last, plus_first, plus_last]]; everything
  # else as [" ", [first, last]]. Example result:
  #   [[" ", [0, 5]],
  #    ["!", [6, 7, 8, 8]],
  #    [" ", [9, 9]],
  #    ["!", [10, 10, 11, 13]],
  #    [" ", [14, 15]]]
  def scan_changed
    attr_str = attributes.join   #=> " ", "-", "+", ... => " -+..."
    cursor = 0
    result = []
    while attr_str.length > cursor
      found = /\-+\++/.match(attr_str[cursor..-1])
      unless found
        # no more "-+"s, so record the rest
        result.push([" ", [cursor, attr_str.length - 1]])
        break
      end
      # record the stuff between the last and current match
      if found.pre_match != ''   # consider '!' at beginning
        result.push([" ", [cursor, cursor + found.pre_match.length - 1]])
        # move the cursor to the beginning of the match
        cursor = cursor + found.pre_match.length
      end
      minus_length = found[0].count('-')
      plus_length  = found[0].count('+')
      # record the current match
      result.push(
        ["!", [cursor,
               cursor + minus_length - 1,
               cursor + minus_length,
               cursor + minus_length + plus_length - 1]]
      )
      # move the cursor to the point just after the match
      cursor = cursor + found[0].length
    end
    return result
  end

  # Escape HTML special characters with CGI::escapeHTML
  def escape_html(string)
    return CGI::escapeHTML(string)
  end

  # Unescape HTML special characters with CGI::unescapeHTML
  def unescape_html(string)
    return CGI::unescapeHTML(string)
  end

  # Format difference data using escape sequence.
  # title and codeset are just ignored in format_escape().
  def format_escape(specified_tags={}, title='no title', codeset='')
    # default escape sequence, or 'tag'
    tags = {
      'remove_begin' => "\033[7m",   # xor on
      'remove_end'   => "\033[27m",  # xor off
      'add_begin'    => "\033[4m",   # underline on
      'add_end'      => "\033[24m",  # underline off
      'end_of_line'  => "\n"
    }
    # merge user-specified tags into pre-defined default tags.
    tags.update(specified_tags)
    body = group_for_escape.collect { |elem|
      text = elem.value.gsub(/\n/, tags['end_of_line'])
      case elem.attribute
      when '-' then tags['remove_begin'] + text + tags['remove_end']
      when '+' then tags['add_begin'] + text + tags['add_end']
      else          text
      end
    }
    # join, not to_s: Array#to_s concatenates only on old Ruby.
    return body.join
  end

  # Group "difference" for escape sequence output
  def group_for_escape
    group_simple
  end

  # Ruby object dump of the difference data
  def format_debug
    @difference.inspect
  end

=begin not implemented yet
  def format_manued
  end
  def group_for_manued
    group_complex
  end
  def group_complex
  end
  def format_xhtml
  end
  def group_for_xhtml
    group_simple
  end
=end

  ##########################################################
  # below are the methods to be used by application.
  ##########################################################

  # version, copyright
  def version
    "#{APPLICATION_NAME} #{DOCDIFF_VERSION}\n#{DOCDIFF_COPYRIGHT}"
  end

  # usage information
  def usage
    return <<-EndOfMessage.gsub(/^    /, '')
    Usage: #{File.basename($0)} [OPTION..] FILE1 FILE2
    OPTION:
      -g --granularity UNIT     Set comparison granularity.
                                UNIT: word  word (default)
                                      char  character
                                      line  line
      -m --morphoanalysis MODE  Set morphoanalysis mode.
                                MODE: auto  on if available (default)
                                      on    force on
                                      off   force off
      -f --format FORMAT        Specify output format.
                                FORMAT: html    HTML (default)
                                        xhtml   XHTML
                                        esc     escape sequence
                                        manued  Manued
                                        debug   Ruby object dump
         --tag TAGS             Set mark-up tags.
         --diff PATH_TO_DIFF    Specify diff command.
         --chasen PATH_TO_CHASEN  Specify chasen command.
         --rc RC_FILE           Specify DocDiff configuration file.
         --chasenrc CHASEN_RC   Specify ChaSen configuration file.
         --cache MODE           Enable/disable cache.
                                MODE: on   enable cache (default)
                                      off  disable cache
         --cachedir CACHE_DIR   Specify cache directory.
      -h --help                 Output help message.
      -v --version              Output version information.
    EndOfMessage
  end

  private

  # Load jcode where it exists; it is not available on Ruby 1.9 and later.
  def load_jcode
    begin
      require 'jcode'
    rescue LoadError
    end
  end
end # end of class DocDiff

############################################################
# DocDiff Application part
############################################################
if $0 == __FILE__
  # GC.disable # not much performance gain (3.7sec->3.4sec). hmm.
  docdiff = DocDiff.new

  if ARGV == []
    $stderr.print(docdiff.usage)
    exit
  end

  ##########################################################
  # process command line option
  #
  require 'getoptlong'
  getoptlong = GetoptLong.new(
    ['--granularity',    '-g', GetoptLong::REQUIRED_ARGUMENT],
    ['--morphoanalysis', '-m', GetoptLong::REQUIRED_ARGUMENT],
    ['--format',         '-f', GetoptLong::REQUIRED_ARGUMENT],
    ['--tag',                  GetoptLong::REQUIRED_ARGUMENT],
    ['--diff',                 GetoptLong::REQUIRED_ARGUMENT],
    ['--chasen',               GetoptLong::REQUIRED_ARGUMENT],
    ['--rc',                   GetoptLong::REQUIRED_ARGUMENT],
    ['--chasenrc',             GetoptLong::REQUIRED_ARGUMENT],
    ['--cache',                GetoptLong::REQUIRED_ARGUMENT],
    ['--cachedir',             GetoptLong::REQUIRED_ARGUMENT],
    ['--help',           '-h', GetoptLong::NO_ARGUMENT],
    ['--version',        '-v', GetoptLong::NO_ARGUMENT]
  )
  # option name => configuration key
  option_keys = {
    '--granularity'    => 'granularity',
    '--morphoanalysis' => 'morphoanalysis',
    '--format'         => 'output_format',
    '--tag'            => 'tag',
    '--diff'           => 'diff_cmd',
    '--chasen'         => 'chasen_cmd',
    '--rc'             => 'rc_file',
    '--chasenrc'       => 'chasen_rc',
    '--cache'          => 'cache',
    '--cachedir'       => 'cache_dir'
  }
  conf_cmdline = Hash.new
  begin
    getoptlong.each do |optname, optarg|
      case optname
      when '--help'
        # stdout, so that "docdiff.rb -h | more" works.
        print(docdiff.usage)
        exit
      when '--version'
        print("#{docdiff.version}\n")
        exit
      else
        conf_cmdline[option_keys[optname]] = optarg if option_keys.key?(optname)
      end
    end
  rescue
    $stderr.print(docdiff.usage)
    exit(1)
  end

  # Both documents are required.
  if ARGV.size < 2
    $stderr.print(docdiff.usage)
    exit(1)
  end

  ##########################################################
  # read configuration from rc file
  #
  conf_rc = Hash.new
  module Conf
    # guarantee that constant 'Conf' exists even when no rc file is loaded.
  end
  # FileTest and load do not expand "~", so build the path from HOME.
  home_rc = ENV['HOME'] ? File.join(ENV['HOME'], '.docdiffrc') : nil
  if conf_cmdline['rc_file'] != nil   # rc file is specified on command line
    if FileTest.exist?(conf_cmdline['rc_file'])
      load conf_cmdline['rc_file']    # load specified config file.
    else
      raise "Failed to read .docdiffrc file: #{conf_cmdline['rc_file']}\n"
    end
  elsif home_rc && FileTest.exist?(home_rc)
    load home_rc
  end
  conf_rc['granularity']    = Conf::GRANULARITY    if Conf::const_defined?(:GRANULARITY)
  conf_rc['morphoanalysis'] = Conf::MORPHOANALYSIS if Conf::const_defined?(:MORPHOANALYSIS)
  conf_rc['output_format']  = Conf::OUTPUT_FORMAT  if Conf::const_defined?(:OUTPUT_FORMAT)
  conf_rc['tag']            = Conf::TAG            if Conf::const_defined?(:TAG)
  conf_rc['diff_cmd']       = Conf::DIFF_CMD       if Conf::const_defined?(:DIFF_CMD)
  conf_rc['chasen_cmd']     = Conf::CHASEN_CMD     if Conf::const_defined?(:CHASEN_CMD)
  conf_rc['chasen_rc']      = Conf::CHASEN_RC      if Conf::const_defined?(:CHASEN_RC)
  conf_rc['cache']          = Conf::CACHE          if Conf::const_defined?(:CACHE)
  conf_rc['cache_dir']      = Conf::CACHE_DIR      if Conf::const_defined?(:CACHE_DIR)

  ##########################################################
  # set default configuration
  #
  conf_default = {
    'granularity'    => 'word',
    'morphoanalysis' => 'auto',
    'output_format'  => 'html',
    'diff_cmd'       => 'diff',
    'chasen_cmd'     => 'chasen',
    'rc_file'        => '~/.docdiffrc',
    'chasen_rc'      => '~/.chasenrc.docdiff',
    'cache'          => 'on',
    'cache_dir'      => Cache.default_cachedir
  }

  ##########################################################
  # merge: defaults < rc file < command line option
  #
  conf_runtime = Hash.new
  conf_runtime.update(conf_default)
  conf_runtime.update(conf_rc)        # overwrite with rc file
  conf_runtime.update(conf_cmdline)   # overwrite with command line option

  # morphoanalysis auto -> on or off
  case conf_runtime['morphoanalysis']
  when 'on'
    conf_runtime['morphoanalysis'] = true
  when 'off'
    conf_runtime['morphoanalysis'] = false
  when 'auto'
    # NOTE(review): FileTest.executable? does not search PATH, so a bare
    # command name such as the default 'chasen' only counts as available
    # when it exists relative to the current directory - confirm intent.
    conf_runtime['morphoanalysis'] =
      (FileTest.executable?(conf_runtime['chasen_cmd']) &&
       (conf_runtime['chasen_rc'] != nil)) ? true : false
  else
    raise "unsupported morphoanalysis value:#{conf_runtime['morphoanalysis'].inspect}\n"
  end

  ##########################################################
  # apply configuration
  #
  docdiff.granularity    = conf_runtime['granularity']
  docdiff.morphoanalysis = conf_runtime['morphoanalysis']
  docdiff.output_format  = conf_runtime['output_format']
  docdiff.tag            = conf_runtime['tag']
  docdiff.diff_cmd       = conf_runtime['diff_cmd']
  docdiff.chasen_cmd     = conf_runtime['chasen_cmd']
  docdiff.chasen_rc      = conf_runtime['chasen_rc']
  docdiff.cache          = conf_runtime['cache']
  docdiff.cache_dir      = conf_runtime['cache_dir']

  ##########################################################
  # read documents, compare, then output the difference
  #
  doc = ARGV.collect { |arg| File.open(arg, 'rb') { |f| f.read } }
  docdiff.compare(doc[0], doc[1], "\n", docdiff.granularity, docdiff.morphoanalysis)
  case docdiff.output_format
  when 'html'
    print docdiff.format_html({}, "#{ARGV[0]} (#{File.stat(ARGV[0]).mtime}), #{ARGV[1]} (#{File.stat(ARGV[1]).mtime})", '')
  when 'esc'
    print docdiff.format_escape
  when 'debug'
    print docdiff.format_debug
  else
    # xhtml and manued are listed in the usage text but not implemented.
    raise "unsupported output format: #{docdiff.output_format.inspect}\n"
  end
end # end if $0 == __FILE__