Jump to content

User:Cacycle/diff.js

From Wikipedia, the free encyclopedia
This is an old revision of this page, as edited by Cacycle (talk | contribs) at 22:53, 28 January 2006 (new diff wetecting block moves). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.
(diff) ← Previous revision | Latest revision (diff) | Newer revision → (diff)
Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
/*<pre><nowiki> */


/*
 * JavaScript diff algorithm
 * by [[User:Cacycle]] (en.wikipedia.org)
 *
 * An implementation of:
 * Paul Heckel: A technique for isolating differences between files
 * Communications of the ACM 21(4):264 (1978)
 * http://doi.acm.org/10.1145/359460.359467
 *
 */

function diff(oldText, newText) {

  var str = '';

// text object holds all text datastructures
  var text = new Object();

// consecutive words of the new text (N)
  text.newWords = new Array();

// consecutive words of the old text (O)
  text.oldWords = new Array();

// array of corresponding word number in old text (NA): text.newToOld
  text.newToOld = new Array();

// array of corresponding word number in new text (OA): text.oldToNew
  text.oldToNew = new Array();


// symbol table for passes 1 - 3  holds words as a hash: symbol['word']
  symbol = new Object();

// symbol table new word occurences counter (NC): symbol.newCtr
  symbol.newCtr = new Array();

// symbol table old word occurences counter (OC): symbol.oldCtr
  symbol.oldCtr = new Array();

// symbol table last old word number: symbol.toNew
  symbol.toNew = new Array();

// symbol table last new word number (OLNA): symbol.toOld
  symbol.toOld = new Array();


// split old text into words
  do {
//               /     |    |    |    |    |   |  |     |   |  |  |    |    |    | /
    var result = /[\w]+|\[\[|\]\]|\{\{|\}\}|\n+| +|&\w+;|'''|''|=+|\{\||\|\}|\|\-|./g.exec(oldText);
    if (result != null) {
      text.oldWords.push(result[0]);
    }
  } while (result != null);

// split new text into words ///VAR???
  do {
    var result = /[\w]+|\[\[|\]\]|\{\{|\}\}|\n+| +|&\w+;|'''|''|=+|\{\||\|\}|\|\-|./g.exec(newText);
    if (result != null) {
      text.newWords.push(result[0]);
    }
  } while (result != null);

// pass 1: parse new text into symbol table s

  for (var i = 0; i < text.newWords.length; i ++) {
    var word = text.newWords[i];

// add new entry to symbol table
    if ( symbol[word] == null) {
      symbol[word] = { newCtr: 0, oldCtr: 0, toNew: null, toOld: null };
    }

// increment symbol table word counter for new text
    symbol[word].newCtr ++;

// add last word number in new text
    symbol[word].toNew = i;
  }

// pass 2: parse old text into symbol table

  for (var i = 0; i < text.oldWords.length; i ++) {
    var word = text.oldWords[i];

// add new entry to symbol table
    if ( symbol[word] == null) {
      symbol[word] = { newCtr: 0, oldCtr: 0, toNew: null, toOld: null };
    }

// increment symbol table word counter for old text
    symbol[word].oldCtr ++;

// add last word number in old text
    symbol[word].toOld = i;
  }

// pass 3: connect unique words

  for (var i in symbol) {

// find words in the symbol table that occur only once in both versions
    if ( (symbol[i].newCtr == 1) && (symbol[i].oldCtr == 1) ) {
      var toNew = symbol[i].toNew;
      var toOld = symbol[i].toOld;

// do not use spaces and non-words as unique markers
      if ( /\w/.test( text.newWords[toNew] ) ) {

// connect from n to o
        text.newToOld[toNew] = toOld;

// connect from o to n
        text.oldToNew[toOld] = toNew;
      }
    }
  }

// pass 4: connect following identical words downwards

  text = pass4(text);
  function pass4(text) {
    for (var i = 0; i < text.newWords.length - 1; i ++) {

// find already connected pairs
      if (text.newToOld[i] != null) {
        var j = text.newToOld[i];

// check if the following words are not yet connected
        if ( (text.newToOld[i + 1] == null) && (text.oldToNew[j + 1] == null) ) {

// if the following words are the same connect them
          if ( text.newWords[i + 1] == text.oldWords[j + 1] ) {
            text.newToOld[i + 1] = j + 1;
            text.oldToNew[j + 1] = i + 1;
          }
        }
      }
    }
    return (text);
  }

// pass 5: connect following identical words upwards

  text = pass5(text);
  function pass5(text) {
    for (var i = text.newWords.length - 1; i > 0; i --) {

// find already connected pairs
      if (text.newToOld[i] != null) {
        var j = text.newToOld[i];

// check if the preceeding words are not yet connected
        if ( (text.newToOld[i - 1] == null) && (text.oldToNew[j - 1] == null) ) {

// if the preceeding words are the same connect them
          if ( text.newWords[i - 1] == text.oldWords[j - 1] ) {
            text.newToOld[i - 1] = j - 1;
            text.oldToNew[j - 1] = i - 1;
          }
        }
      }
    }
    return (text);
  }

// unresolved regions caused by adding two common words into sequences of common words
// are resolved by pairwise comparison of the words
  for (var i = 0; i < text.newWords.length - 1; i ++) {

// find already connected pairs
    if (text.newToOld[i] != null) {
      var j = text.newToOld[i];

// find corresponding unresolved sequences
      if ( (text.newToOld[i + 1] == null) && (text.oldToNew[j + 1] == null) ) {

// determine the end of the sequences
        var ib = i + 2;
        while ( (text.newToOld[ib] == null) && (ib < text.newWords.length) ) {
          ib ++;
        }
        var jb = j + 2;
        while ( (text.oldToNew[jb] == null) && (jb < text.oldWords.length) ) {
          jb ++;
        }

// compare wordwise, exclude spaces
        for (i2 = i + 1; i2 < ib; i2 ++) {
          if ( /\s/.test( text.newWords[i2] ) ) {
            continue;
          }
          for (j2 = j + 1; j2 < jb; j2 ++) {
            if (text.oldToNew[j2] != null) {
              continue;
            }
            if ( text.newWords[i2] == text.oldWords[j2] ) {
              text.newToOld[i2] = j2;
              text.oldToNew[j2] = i2;
              break;
            }
          }
        }
      }
    }
  }

// repeat pass 4: connect following identical spaces downwards
  text = pass4(text);

// repeat pass 5: connect following identical spaces upwards
  text = pass5(text);

// pass 6: generate output

  str += '<span class="diffBlock0">';
  var block = 1;

  var i = 0;
  var j = 0;
  do {

// collect consecutive identical text
    var ident = '';
    while ( (text.newToOld[i] != null) && (i < text.newWords.length) ) {

// detect block boundary
      if ( (text.newToOld[i] != j) && (text.newToOld[i] != null) ) {
        j = text.newToOld[i];
        ident += '</span><span class="diffBlock' + block + '">';
        block = 1 - block;
      }

      ident += escape(text.newWords[i]);
      i ++;
      j ++;
    }

// collect consecutive deletions
    var del = '';
    while ( (text.oldToNew[j] == null) && (j < text.oldWords.length) ) {
      del += text.oldWords[j];
      j ++;
    }

// collect consecutive inserts
    var ins = '';
    while ( (text.newToOld[i] == null) && (i < text.newWords.length) ) {
      ins += text.newWords[i];
      i ++;
    }

// remove leading and trailing similarities betweein del and ins from highlighting
    var pre = '';
    var post = '';
    if ( (del != '') && (ins != '') ) {

// remove leading similarities
      while ( del.charAt(0) == ins.charAt(0) ) {
        pre = pre + del.charAt(0);
        del = del.substr(1);
        ins = ins.substr(1);
      }

// remove trailing similarities
      while ( del.charAt(del.length - 1) == ins.charAt(ins.length - 1) ) {
        post = del.charAt(del.length - 1) + post;
        del = del.substr(0, del.length - 1);
        ins = ins.substr(0, ins.length - 1);
      }
    }

// output the identical text, deletions and inserts
    str += ident;
    str += pre;
    if (del != '') {
      del = '<font class="diffDelete">' + escape(del) + "</font>";
      del = del.replace(/\n/g, '&para;<br>');
      str += del;
    }
    if (ins != '') {
      ins = '<font class="diffInsert">' + escape(ins) + "</font>";
      ins = ins.replace(/\n/g, '&para;<br>');
      str += ins;
    }
    str += post;

  } while (i < text.newWords.length);

  str += '</span>';
  str = str.replace(/  /g, ' &nbsp;');
  str = str.replace(/\n/g, '<br>');
  return(str);
}


function escape(str) {
  str = str.replace(/&/g, "&amp;");
  str = str.replace(/</g, "&lt;");
  str = str.replace(/>/g, "&gt;");
  str = str.replace(/"/g, "&quot;"); //"
  return (str);
}


/* </nowiki></pre> */