d3-spring-model/src/t-sne.js

import constant from "./constant";

/**
 * Set the node id accessor to the specified i.
 * @param  {node} d - node.
 * @param  {accessor} i - id accessor.
 * @return {accessor}  - node id accessor.
 */
function index(d, i) {
  return i;
}

/**
 * t-SNE implementation in D3 by using the code existing in tsnejs
 * (https://github.com/karpathy/tsnejs) to compute the solution.
 */
export default function() {
  var id = index,
    distance = constant(300),
    nodes,
    perplexity = 30,
    learningRate = 10,
    iteration = 0,
    dim = 2,
    N, // length of the nodes.
    P, // probability matrix.
    Y, // solution.
    gains,
    ystep;

  /**
   * Make a step in t-SNE algorithm and set the velocities for the nodes
   * to accumulate the values from solution.
   */
  function force() {
    // Make a step at each iteration.
    step();
    var solution = getSolution();

    // Set the velocity for each node using the solution.
    for (var i = 0; i < nodes.length; i++) {
      nodes[i].vx += solution[i][0];
      nodes[i].vy += solution[i][1];
    }
  }

  /**
   * Calculates the random number from Gaussian distribution.
   * @return {number} random number.
   */
  function gaussRandom() {
    let u = 2 * Math.random() - 1;
    let v = 2 * Math.random() - 1;
    let r = u * u + v * v;
    if (r == 0 || r > 1) return gaussRandom();
    return u * Math.sqrt(-2 * Math.log(r) / r);
  }

  /**
   * Return the normalized number.
   * @return {number} normalized random number from Gaussian distribution.
   */
  function randomN() {
    return gaussRandom() * 1e-4;
  }

  function sign(x) {
    return x > 0 ? 1 : x < 0 ? -1 : 0;
  }

  /**
   * Create an array of length n filled with zeros.
   * @param  {number} n - length of array.
   * @return {Float64Array}   - array of zeros with length n.
   */
  function zeros(n) {
    if (typeof(n) === 'undefined' || isNaN(n)) {
      return [];
    }
    return new Float64Array(n); // typed arrays are faster
  }

  // Returns a 2d array of random numbers
  /**
   * Creates a 2d array filled with random numbers.
   * @param  {number} n - rows.
   * @param  {number} d - columns.
   * @return {array}   - 2d array
   */
  function random2d(n, d) {
    var x = [];
    for (var i = 0; i < n; i++) {
      var y = [];
      for (var j = 0; j < d; j++) {
        y.push(randomN());
      }
      x.push(y);
    }
    return x;
  }

  /**
   * Compute the probability matrix using the provided data.
   * @param  {array} data       - nodes.
   * @param  {number} perplexity - used to calculate entropy of distribution.
   * @param  {number} tol        - limit for entropy difference.
   * @return {2d array}          - 2d matrix containing probabilities.
   */
  function d2p(data, perplexity, tol) {
    N = Math.floor(data.length);
    var Htarget = Math.log(perplexity); // target entropy of distribution.
    var P1 = zeros(N * N); // temporary probability matrix.

    var prow = zeros(N); // a temporary storage compartment.
    for (var i = 0; i < N; i++) {
      var betamin = -Infinity;
      var betamax = Infinity;
      var beta = 1; // initial value of precision.
      var done = false;
      var maxtries = 50;

      // Perform binary search to find a suitable precision beta
      // so that the entropy of the distribution is appropriate.
      var num = 0;
      while (!done) {
        // Compute entropy and kernel row with beta precision.
        var psum = 0.0;
        for (var j = 0; j < N; j++) {
          var pj = Math.exp(-distance(data[i], data[j]) * beta);
          // Ignore the diagonals
          if (i === j) {
            pj = 0;
          }
          prow[j] = pj;
          psum += pj;
        }
        // Normalize p and compute entropy.
        var Hhere = 0.0;
        for (j = 0; j < N; j++) {
          if (psum == 0) {
            pj = 0;
          } else {
            pj = prow[j] / psum;
          }
          prow[j] = pj;
          if (pj > 1e-7) {
            Hhere -= pj * Math.log(pj);
          }
        }

        // Adjust beta based on result.
        if (Hhere > Htarget) {
          // Entropy was too high (distribution too diffuse)
          // so we need to increase the precision for more peaky distribution.
          betamin = beta; // move up the bounds.
          if (betamax === Infinity) {
            beta = beta * 2;
          } else {
            beta = (beta + betamax) / 2;
          }

        } else {
          // Converse case. Make distrubtion less peaky.
          betamax = beta;
          if (betamin === -Infinity) {
            beta = beta / 2;
          } else {
            beta = (beta + betamin) / 2;
          }
        }

        // Stopping conditions: too many tries or got a good precision.
        num++;
        if (Math.abs(Hhere - Htarget) < tol || num >= maxtries) {
          done = true;
        }
      }

      // Copy over the final prow to P1 at row i
      for (j = 0; j < N; j++) {
        P1[i * N + j] = prow[j];
      }

    }

    // Symmetrize P and normalize it to sum to 1 over all ij
    var Pout = zeros(N * N);
    var N2 = N * 2;
    for (i = 0; i < N; i++) {
      for (j = 0; j < N; j++) {
        Pout[i * N + j] = Math.max((P1[i * N + j] + P1[j * N + i]) / N2, 1e-100);
      }
    }
    return Pout;
  }

  /**
   * Initialize a starting (random) solution.
   */
  function initSolution() {
    Y = random2d(N, dim);
    // Step gains to accelerate progress in unchanging directions.
    gains = random2d(N, dim, 1.0);
    // Momentum accumulator.
    ystep = random2d(N, dim, 0.0);
    iteration = 0;
  }

  /**
   * @return {2d array} the solution.
   */
  function getSolution() {
    return Y;
  }

  /**
   * Do a single step (iteration) for the layout.
   * @return {number} the current cost.
   */
  function step() {
    iteration += 1;

    var cg = costGrad(Y); // Evaluate gradient.
    var cost = cg.cost;
    var grad = cg.grad;

    // Perform gradient step.
    var ymean = zeros(dim);
    for (var i = 0; i < N; i++) {
      for (var d = 0; d < dim; d++) {
        var gid = grad[i][d];
        var sid = ystep[i][d];
        var gainid = gains[i][d];

        // Compute gain update.
        var newgain = sign(gid) === sign(sid) ? gainid * 0.8 : gainid + 0.2;
        if (newgain < 0.01) {
          newgain = 0.01;
        }
        gains[i][d] = newgain;

        // Compute momentum step direction.
        var momval = iteration < 250 ? 0.5 : 0.8;
        var newsid = momval * sid - learningRate * newgain * grad[i][d];
        ystep[i][d] = newsid;

        // Do the step.
        Y[i][d] += newsid;

        // Accumulate mean so that we can center later.
        ymean[d] += Y[i][d];
      }
    }

    // Reproject Y to have the zero mean.
    for (i = 0; i < N; i++) {
      for (d = 0; d < dim; d++) {
        Y[i][d] -= ymean[d] / N;
      }
    }
    return cost;
  }

  /**
   * Calculate the cost and the gradient.
   * @param  {2d array} Y - the current solution to evaluate.
   * @return {object}   that contains a cost and a gradient.
   */
  function costGrad(Y) {

    var pmul = iteration < 100 ? 4 : 1;

    // Compute current Q distribution, unnormalized first.
    var Qu = zeros(N * N);
    var qsum = 0.0;
    for (var i = 0; i < N; i++) {
      for (var j = i + 1; j < N; j++) {
        var dsum = 0.0;
        for (var d = 0; d < dim; d++) {
          var dhere = Y[i][d] - Y[j][d];
          dsum += dhere * dhere;
        }
        var qu = 1.0 / (1.0 + dsum); // Student t-distribution.
        Qu[i * N + j] = qu;
        Qu[j * N + i] = qu;
        qsum += 2 * qu;
      }
    }
    // Normalize Q distribution to sum to 1.
    var NN = N * N;
    var Q = zeros(NN);
    for (var q = 0; q < NN; q++) {
      Q[q] = Math.max(Qu[q] / qsum, 1e-100);
    }

    var cost = 0.0;
    var grad = [];
    for (i = 0; i < N; i++) {
      var gsum = new Array(dim); // Initialize gradiet for point i.
      for (d = 0; d < dim; d++) {
        gsum[d] = 0.0;
      }
      for (j = 0; j < N; j++) {
        // Accumulate the cost.
        cost += -P[i * N + j] * Math.log(Q[i * N + j]);
        var premult = 4 * (pmul * P[i * N + j] - Q[i * N + j]) * Qu[i * N + j];
        for (d = 0; d < dim; d++) {
          gsum[d] += premult * (Y[i][d] - Y[j][d]);
        }
      }
      grad.push(gsum);
    }

    return {
      cost: cost,
      grad: grad
    };
  }

  /**
   * Calculates the stress. Basically, it computes the difference between
   * high dimensional distance and real distance. The lower the stress is,
   * the better layout.
   * @return {number} - stress of the layout.
   */
  function getStress() {
    var totalDiffSq = 0,
      totalHighDistSq = 0;
    for (var i = 0, source, target, realDist, highDist; i < nodes.length; i++) {
      for (var j = 0; j < nodes.length; j++) {
        if (i !== j) {
          source = nodes[i], target = nodes[j];
          realDist = Math.hypot(target.x - source.x, target.y - source.y);
          highDist = +distance(nodes[i], nodes[j]);
          totalDiffSq += Math.pow(realDist - highDist, 2);
          totalHighDistSq += highDist * highDist;
        }
      }
    }
    return Math.sqrt(totalDiffSq / totalHighDistSq);
  }

  // API for initializing the algorithm, setting parameters and querying
  // metrics.
  force.initialize = function(_) {
    nodes = _;
    N = nodes.length;
    // Initialize the probability matrix.
    P = d2p(nodes, perplexity, 1e-4);
    initSolution();
  };

  force.id = function(_) {
    return arguments.length ? (id = _, force) : id;
  };

  force.distance = function(_) {
    return arguments.length ? (distance = typeof _ === "function" ? _ : constant(+_), force) : distance;
  };

  force.stress = function() {
    return getStress();
  };

  force.learningRate = function(_) {
    return arguments.length ? (learningRate = +_, force) : learningRate;
  };

  force.perplexity = function(_) {
    return arguments.length ? (perplexity = +_, force) : perplexity;
  };

  return force;
}