Source: classifiers/DocumentClassifier.js

/**
 * The Bravey document classifier, based on Naive Bayes.
 *
 * Documents are tokenized with Bravey.Text, optionally filtered and stemmed,
 * and per-label stem/document counters are kept in memory. A "none" label is
 * always present: it is seeded with an empty document on construction.
 *
 * @constructor
 * @param {Object} [extensions] - Optional processing hooks.
 * @param {function(string): string} [extensions.stemmer] - Called with each word; its return value is used as the stem.
 * @param {function(string[]): string[]} [extensions.filter] - Called with the tokenized word list; its return value replaces the list.
 */
Bravey.DocumentClassifier = function(extensions) {
  extensions = extensions || {};

  // Smoothing parameters used when correcting a stem's raw "wordicity":
  // rarely seen stems are pulled towards the neutral assumed probability.
  var SMOOTHING_WEIGHT = 1;
  var ASSUMED_PROBABILITY = 0.5;

  // Flat counter store, keyed by the strings built by the *Key helpers below.
  var storage = {};
  // Registered label names, in registration order. Kept as an array (not a
  // comma-joined string) so labels that contain commas survive intact.
  var registeredLabels = [];

  var stemKey = function(stem, label) {
    return 'stem:' + stem + '::label:' + label;
  };
  var docCountKey = function(label) {
    return 'docCount:' + label;
  };
  var stemCountKey = function(stem) {
    return 'stemCount:' + stem;
  };

  // Reads a counter; counters that were never incremented read as 0.
  var readCount = function(key) {
    return storage[key] || 0;
  };

  // Adds one to a counter and returns the new value.
  var increment = function(key) {
    var next = readCount(key) + 1;
    storage[key] = next;
    return next;
  };

  // Division that yields 0 instead of NaN/Infinity when the divisor is 0.
  var safeRatio = function(numerator, denominator) {
    return denominator === 0 ? 0 : numerator / denominator;
  };

  // Returns a copy of the registered labels so callers cannot mutate the registry.
  var getLabels = function() {
    return registeredLabels.slice();
  };

  // Registers a label once. Empty, null and undefined labels are never
  // registered, so they never show up in the classification scores.
  var registerLabel = function(label) {
    var name = (label === null || label === undefined) ? '' : String(label);
    if (name.length && registeredLabels.indexOf(name) === -1) {
      registeredLabels.push(name);
    }
    return true;
  };

  // Number of times the stem was seen in documents of the given label.
  var stemLabelCount = function(stem, label) {
    return readCount(stemKey(stem, label));
  };

  // Number of times the stem was seen in documents of every other registered label.
  var stemInverseLabelCount = function(stem, label) {
    var total = 0;
    for (var i = 0; i < registeredLabels.length; i++) {
      if (registeredLabels[i] !== label) {
        total += stemLabelCount(stem, registeredLabels[i]);
      }
    }
    return total;
  };

  // Number of times the stem was seen across all trained documents.
  var stemTotalCount = function(stem) {
    return readCount(stemCountKey(stem));
  };

  // Number of documents trained for the given label.
  var docCount = function(label) {
    return readCount(docCountKey(label));
  };

  // Number of documents trained for every other registered label.
  var docInverseCount = function(label) {
    var total = 0;
    for (var i = 0; i < registeredLabels.length; i++) {
      if (registeredLabels[i] !== label) {
        total += docCount(registeredLabels[i]);
      }
    }
    return total;
  };

  var incrementStem = function(stem, label) {
    increment(stemCountKey(stem));
    increment(stemKey(stem, label));
  };

  var incrementDocCount = function(label) {
    return increment(docCountKey(label));
  };

  // Cleans and tokenizes the text, then applies the optional filter and stemmer.
  var toStems = function(text) {
    var words = Bravey.Text.tokenize(Bravey.Text.clean(text));
    if (extensions.filter) words = extensions.filter(words);
    if (!extensions.stemmer) return words;
    var stems = [];
    // Plain loop so the stemmer receives the word as its only argument.
    for (var i = 0; i < words.length; i++) {
      stems.push(extensions.stemmer(words[i]));
    }
    return stems;
  };

  // Records one document: every stem is counted for the label, then the
  // label's document counter is bumped.
  var train = function(text, label) {
    registerLabel(label);
    var stems = toStems(text);
    for (var i = 0; i < stems.length; i++) {
      incrementStem(stems[i], label);
    }
    incrementDocCount(label);
  };

  // Computes a score in (0, 1) for every registered label.
  var guess = function(text) {
    var stems = toStems(text);
    var labels = getLabels();
    var scores = {};

    // Stems never seen during training carry no evidence; drop them once
    // here instead of re-checking them for every label.
    var known = [];
    for (var i = 0; i < stems.length; i++) {
      var seen = stemTotalCount(stems[i]);
      if (seen !== 0) {
        known.push({
          stem: stems[i],
          total: seen
        });
      }
    }

    for (var j = 0; j < labels.length; j++) {
      var label = labels[j];
      var labelDocs = docCount(label);
      var otherDocs = docInverseCount(label);
      var logSum = 0;

      for (var k = 0; k < known.length; k++) {
        var stem = known[k].stem;
        var total = known[k].total;

        // safeRatio avoids 0/0 when the label is the only one trained
        // (otherDocs is 0), which used to turn every score into NaN.
        var wordProbability = safeRatio(stemLabelCount(stem, label), labelDocs);
        var wordInverseProbability = safeRatio(stemInverseLabelCount(stem, label), otherDocs);

        // With no evidence on either side the stem is treated as neutral.
        var evidence = wordProbability + wordInverseProbability;
        var wordicity = evidence > 0 ? wordProbability / evidence : ASSUMED_PROBABILITY;

        // Pull the estimate towards the assumed probability; the fewer times
        // the stem was seen, the stronger the pull.
        wordicity = ((SMOOTHING_WEIGHT * ASSUMED_PROBABILITY) + (total * wordicity)) / (SMOOTHING_WEIGHT + total);

        // Guard against log(0) below.
        if (wordicity === 0)
          wordicity = 0.01;
        else if (wordicity === 1)
          wordicity = 0.99;

        logSum += (Math.log(1 - wordicity) - Math.log(wordicity));
      }
      scores[label] = 1 / (1 + Math.exp(logSum));
    }
    return scores;
  };

  // Picks the label with the highest score; label is null when no score is above 0.
  var extractWinner = function(scores) {
    var bestScore = 0;
    var bestLabel = null;
    for (var label in scores) {
      if (scores[label] > bestScore) {
        bestScore = scores[label];
        bestLabel = label;
      }
    }
    return {
      label: bestLabel,
      score: bestScore
    };
  };

  /**
   * Add a document to the classifier.
   * @param {string} text - The text to be classified.
   * @param {string} label - The related label
   * @returns {string} The classified text, unchanged.
   */
  this.addDocument = function(text, label) {
    train(text, label);
    return text;
  };

  /**
   * Classify a document.
   * @param {string} text - The document to be classified.
   * @returns {DocumentClassification} The document class.
   */
  this.classifyDocument = function(text) {
    var scores = guess(text);
    var winner = extractWinner(scores);
    return {
      scores: scores,
      winner: winner
    };
  };

  // Seed the fallback label so there is always at least one class to compare against.
  this.addDocument("", "none");

};

/**
 Describes a document classification.
 @typedef DocumentClassification
 @type {Object}
 @property {Object.<string, number>} scores The score of each known document label, keyed by label name.
 @property {number} winner.score The score of the winning label.
 @property {string} winner.label The name of the winning label.
*/