// Copyright (c) 2020, Ashwin Narayan and Hyunghoon Cho

// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:

// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// This is modified from the source code provided in the original implementation
// of t-SNE with the following copyright: 

/*
 *
 * Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    This product includes software developed by the Delft University of Technology.
 * 4. Neither the name of the Delft University of Technology nor the names of
 *    its contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
 * OF SUCH DAMAGE.
 *
 */

#include <cfloat>
#include <cmath>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <ctime>
#include "vptree.h"
#include "sptree.h"
#include "densne.h"
#include <Rcpp.h>


#ifdef _OPENMP
  #include <omp.h>
#endif

using namespace std;

static double sign(double x) { return (x == .0 ? .0 : (x < .0 ? -1.0 : 1.0)); }

static void zeroMean(double* X, int N, int D);
static void standardize(double* x, int N);
static void computeGaussianPerplexity(double* X, int N, int D, double* P, double perplexity, bool verbose);
static void computeGaussianPerplexity(double* X, int N, int D, unsigned int** _row_P, unsigned int** _col_P, double** _val_P, double** _val_D, double perplexity, int K, bool verbose);
static void computeExactGradient(double* P, double* Y, int N, int D, double* dC);
static void computeGradient(unsigned int* inp_row_P, unsigned int* inp_col_P, double* inp_val_P, double* Y, int N, int D, double* dC, double theta);
static void computeGradientDens(unsigned int* inp_row_P, unsigned int* inp_col_P, double* inp_val_P, double* Y, int N, int D, double* dC, double theta, double* R, double logdist_shift, double dens_lambda, double var_shift);
static double evaluateError(double* P, double* Y, int N, int D);
static double evaluateError(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, int N, int D, double theta);
static double evaluateErrorDens(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, int N, int D, double theta, double logdist_shift, double dens_lambda, double var_shift, double* R, double* rhosq, double* re);
static void computeSquaredEuclideanDistance(double* X, int N, int D, double* DD);
static void symmetrizeMatrix(unsigned int** row_P, unsigned int** col_P, double** val_P, double** val_D, unsigned int N);
static void computeExpectedLogDist(double** out, unsigned int* _row_P, unsigned int* _col_P, double* _val_P, double* _val_D, int N, double logdist_shift);


// Perform t-SNE
void DENSNE::run(
        double* X, int N, int D, double* Y, double* dens, int no_dims, 
        double perplexity, double theta, bool skip_random_init, int max_iter,
        double momentum, double final_momentum, double eta,
        int stop_lying_iter, int mom_switch_iter, double dens_frac, 
        double dens_lambda, bool final_dens, int num_threads, bool verbose) {

    if (verbose) {
        Rprintf("dens_frac: %f\n", dens_frac);
        Rprintf("dens_lambda: %f\n", dens_lambda);
        Rprintf("final_dens: %d\n", final_dens);
    }
    #ifdef _OPENMP
      int threads = num_threads;
      if (num_threads==0) {
        threads = omp_get_max_threads();
      }
      
      // Print notice whether OpenMP is used
      if (verbose) Rprintf("OpenMP is working. %d threads.\n", threads);
    #endif

    // Determine whether we are using an exact algorithm
    if(N - 1 < 3 * perplexity) { 
        Rcpp::stop("Perplexity too large for the number of data points!\n");
    }
    if (verbose) {
        Rprintf("Using no_dims = %d, perplexity = %f, and theta = %f\n", no_dims, perplexity, theta);
    }
    bool exact = (theta == .0) ? true : false;

    if (exact) { Rcpp::stop("den-SNE based on exact t-SNE is currently unsupported\n"); }

    // Set learning parameters
    float total_time = .0;
    clock_t start, end;
    // double momentum = .5, final_momentum = .8;
    // double eta = 200.0;
    double logdist_shift = 0;
    double var_shift = 0.1;

    // Allocate some memory
    double* dY    = (double*) malloc(N * no_dims * sizeof(double));
    double* uY    = (double*) malloc(N * no_dims * sizeof(double));
    double* gains = (double*) malloc(N * no_dims * sizeof(double));
    if(dY == NULL || uY == NULL || gains == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
    for(int i = 0; i < N * no_dims; i++)    uY[i] =  .0;
    for(int i = 0; i < N * no_dims; i++) gains[i] = 1.0;

    // Normalize input data (to prevent numerical problems)
    if (verbose) {
        Rprintf("Computing input similarities...\n");
    }
    start = clock();
    zeroMean(X, N, D);
    double max_X = .0;
    for(int i = 0; i < N * D; i++) {
        if(fabs(X[i]) > max_X) max_X = fabs(X[i]);
    }
    for(int i = 0; i < N * D; i++) X[i] /= max_X;

    // Compute input similarities for exact t-SNE
    double* P; unsigned int* row_P; unsigned int* col_P; double* val_P; double* val_D;
    if(exact) {

        // Compute similarities
        if (verbose) {
            Rprintf("Exact?");
        }
        P = (double*) malloc(N * N * sizeof(double));
        if(P == NULL) { Rcpp::stop("Memory allocation failed!\n");}
        computeGaussianPerplexity(X, N, D, P, perplexity, verbose);

        // Symmetrize input similarities
        if (verbose) {
            Rprintf("Symmetrizing...\n");
        }
        int nN = 0;
        for(int n = 0; n < N; n++) {
            int mN = (n + 1) * N;
            for(int m = n + 1; m < N; m++) {
                P[nN + m] += P[mN + n];
                P[mN + n]  = P[nN + m];
                mN += N;
            }
            nN += N;
        }
        double sum_P = .0;
        for(int i = 0; i < N * N; i++) sum_P += P[i];
        for(int i = 0; i < N * N; i++) P[i] /= sum_P;
    }

    // Compute input similarities for approximate t-SNE
    else {

        // Compute asymmetric pairwise input similarities
        computeGaussianPerplexity(X, N, D, &row_P, &col_P, &val_P, &val_D, perplexity, (int) (3 * perplexity), verbose);

        // Symmetrize input similarities
        symmetrizeMatrix(&row_P, &col_P, &val_P, &val_D, N);

        double sum_P = .0;
        for(unsigned int i = 0; i < row_P[N]; i++) sum_P += val_P[i];
        for(unsigned int i = 0; i < row_P[N]; i++) val_P[i] /= sum_P;
    }
    end = clock();

    // Compute expected log distance for each point
    double* ro;
    computeExpectedLogDist(&ro, row_P, col_P, val_P, val_D, N, logdist_shift);
    free(val_D); val_D = NULL;

    if (final_dens) {
      // Save to output variable
      for (int n = 0; n < N; n++) {
        dens[2 * n] = ro[n]; 
      }
    }

    // Whiten to simplify correlation objective
    standardize(ro, N);

    // Initialize solution (randomly)
    if (!skip_random_init) {

      for(int i = 0; i < N * no_dims; i++) Y[i] = R::rnorm(0, 1) * .0001;
      
      // Lie about the P-values (only for random initialization)
      if(exact) { for(int i = 0; i < N * N; i++)        P[i] *= 12.0; }
      else {      for(unsigned int i = 0; i < row_P[N]; i++) val_P[i] *= 12.0; }

    }

    // Perform main training loop
    if (verbose) {
        if(exact) {
            Rprintf("Input similarities computed in %4.2f seconds!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC);
        } else {
            Rprintf("Input similarities computed in %4.2f seconds (sparsity = %f)!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC, (double) row_P[N] / ((double) N * (double) N));
        }
    }
    start = clock();

    double* re = (double*) calloc(N, sizeof(double));
    if (re == NULL) { Rcpp::stop("Memory allocation failed!\n"); }

    bool dens_flag = false;
    for(int iter = 1; iter <= max_iter; iter++) {
        Rcpp::checkUserInterrupt();
        // Compute (approximate) gradient
        if (exact) {
          computeExactGradient(P, Y, N, no_dims, dY);
        } else if (iter < (1 - dens_frac) * max_iter || fabs(dens_lambda) < FLT_MIN) {
          computeGradient(row_P, col_P, val_P, Y, N, no_dims, dY, theta);
        } else {
          if (!dens_flag) {
            if (verbose) {
              Rprintf("Iteration %d: density optimization switched on\n", iter);
            }
            dens_flag = true;
          }
          computeGradientDens(row_P, col_P, val_P, Y, N, no_dims, dY, theta,
                              ro, logdist_shift, dens_lambda, var_shift);
        }

        // Update gains
        for(int i = 0; i < N * no_dims; i++) gains[i] = (sign(dY[i]) != sign(uY[i])) ? (gains[i] + .2) : (gains[i] * .8);
        for(int i = 0; i < N * no_dims; i++) if(gains[i] < .01) gains[i] = .01;

        // Perform gradient update (with momentum and gains)
        for(int i = 0; i < N * no_dims; i++) uY[i] = momentum * uY[i] - eta * gains[i] * dY[i];
        for(int i = 0; i < N * no_dims; i++)  Y[i] = Y[i] + uY[i];

        // Make solution zero-mean
            zeroMean(Y, N, no_dims);

        // Stop lying about the P-values after a while, and switch momentum
        if(!skip_random_init && iter == stop_lying_iter) {
            if(exact) { for(int i = 0; i < N * N; i++)        P[i] /= 12.0; }
            else      { for(unsigned int i = 0; i < row_P[N]; i++) val_P[i] /= 12.0; }
        }
        if(iter == mom_switch_iter) momentum = final_momentum;

        // Print out progress
        if (iter % 50 == 0 || iter == max_iter) {
            end = clock();

            double C = .0;
            double rho = .0;

            if (exact) {
              C = evaluateError(P, Y, N, no_dims);
            } else if (dens_flag) {
              C = evaluateErrorDens(row_P, col_P, val_P, Y, N, no_dims, theta,
                                    logdist_shift, dens_lambda, var_shift, ro, &rho, re);
            } else {
              C = evaluateError(row_P, col_P, val_P, Y, N, no_dims, theta);  // doing approximate computation here!
            }

            float elapsed_time = (float) (end - start) / CLOCKS_PER_SEC;
            total_time += elapsed_time;

            if (verbose) {
              if (dens_flag) {
                Rprintf("Iteration %d: error is %f, rho = %f", iter, C, rho);
              } else {
                Rprintf("Iteration %d: error is %f", iter, C);
              }
              Rprintf(" (50 iterations in %4.2f seconds)\n", elapsed_time);
            }
            start = clock();
        }
    }

    end = clock(); total_time += (float) (end - start) / CLOCKS_PER_SEC;

    // Clean up memory
    free(dY);
    free(uY);
    free(gains);
    if(exact) free(P);
    else {
        free(row_P); row_P = NULL;
        free(col_P); col_P = NULL;
        free(val_P); val_P = NULL;
    }
    free(ro);

    if (verbose) {
      Rprintf("Fitting performed in %4.2f seconds.\n", total_time);
    }

    if (re != NULL) {
      free(re); re = NULL;
    }

    // Compute embedding density
    if (final_dens) {
        if (verbose) {
          Rprintf("Computing embedding local densities\n");
        }

        computeGaussianPerplexity(Y, N, 2, &row_P, &col_P, &val_P, &val_D, perplexity, (int) (3 * perplexity), verbose);
        symmetrizeMatrix(&row_P, &col_P, &val_P, &val_D, N);

        // Compute expected log distance for each point
        computeExpectedLogDist(&re, row_P, col_P, val_P, val_D, N, logdist_shift);
        free(val_D); val_D = NULL;

        // Save to output variable
        for (int n = 0; n < N; n++) {
          dens[2 * n + 1] = re[n]; 
        }
        free(re);
        
        // Clean up memory
        free(row_P); row_P = NULL;
        free(col_P); col_P = NULL;
        free(val_P); val_P = NULL;
    }

}

// Compute gradient of the t-SNE cost function (using Barnes-Hut algorithm)
static void computeGradient(unsigned int* inp_row_P, unsigned int* inp_col_P, double* inp_val_P, double* Y, int N, int D, double* dC, double theta)
{

    // Construct space-partitioning tree on current map
    SPTree* tree = new SPTree(D, Y, N);

    // Compute all terms required for t-SNE gradient
    double sum_Q = .0;
    double* pos_f = (double*) calloc(N * D, sizeof(double));
    double* neg_f = (double*) calloc(N * D, sizeof(double));
    if(pos_f == NULL || neg_f == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
    tree->computeEdgeForces(inp_row_P, inp_col_P, inp_val_P, N, pos_f, NULL, NULL, 0);

    for(int n = 0; n < N; n++) {
        tree->computeNonEdgeForces(n, theta, neg_f + n * D, &sum_Q);
    }
    
    // Compute final t-SNE gradient
    for(int i = 0; i < N * D; i++) {
        dC[i] = pos_f[i] - (neg_f[i] / sum_Q);
    }

    free(pos_f);
    free(neg_f);
    delete tree;
}

// Compute gradient of the t-SNE cost function (using Barnes-Hut algorithm)
static void computeGradientDens(unsigned int* inp_row_P, unsigned int* inp_col_P, double* inp_val_P, double* Y, int N, int D, double* dC, double theta, double* R, double logdist_shift, double dens_lambda, double var_shift)
{

    // Construct space-partitioning tree on current map
    SPTree* tree = new SPTree(D, Y, N);

    // Compute all terms required for t-SNE gradient
    double sum_Q = .0;
    double* pos_f = (double*) calloc(N * D, sizeof(double));
    double* neg_f = (double*) calloc(N * D, sizeof(double));
    double* dens_f = (double*) calloc(N * D, sizeof(double)); // density gradients

    double* re = (double*) calloc(N, sizeof(double)); // embedding densities
    double* q_norm = (double*) calloc(N, sizeof(double)); // normalization factors

    if (pos_f == NULL || neg_f == NULL || dens_f == NULL || re == NULL || q_norm == NULL) {
      Rcpp::stop("Memory allocation failed!\n"); 
    }

    tree->computeEdgeForces(inp_row_P, inp_col_P, inp_val_P, N, pos_f,
                            re, q_norm, logdist_shift); // re and q_norm updated

    for (int n = 0; n < N; n++) {
      tree->computeNonEdgeForces(n, theta, neg_f + n * D, &sum_Q);
    }
    
    // Compute gradient of density correlation term
    tree->computeEdgeForcesDens(inp_row_P, inp_col_P, inp_val_P, N, dens_f,
                                R, re, q_norm, logdist_shift, var_shift);

    // Compute final t-SNE gradient
    for(int i = 0; i < N * D; i++) {
        dC[i] = pos_f[i] - (neg_f[i] / sum_Q) - dens_lambda * dens_f[i];
    }

    free(pos_f);
    free(neg_f);
    free(dens_f);
    free(re);
    free(q_norm);
    delete tree;
}

// Compute gradient of the t-SNE cost function (exact)
static void computeExactGradient(double* P, double* Y, int N, int D, double* dC) {

    // Make sure the current gradient contains zeros
    for(int i = 0; i < N * D; i++) dC[i] = 0.0;

    // Compute the squared Euclidean distance matrix
    double* DD = (double*) malloc(N * N * sizeof(double));
    if(DD == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
    computeSquaredEuclideanDistance(Y, N, D, DD);

    // Compute Q-matrix and normalization sum
    double* Q    = (double*) malloc(N * N * sizeof(double));
    if(Q == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
    double sum_Q = .0;
    int nN = 0;
    for(int n = 0; n < N; n++) {
        for(int m = 0; m < N; m++) {
            if(n != m) {
                Q[nN + m] = 1 / (1 + DD[nN + m]);
                sum_Q += Q[nN + m];
            }
        }
        nN += N;
    }

    // Perform the computation of the gradient
    nN = 0;
    int nD = 0;
    for(int n = 0; n < N; n++) {
        int mD = 0;
        for(int m = 0; m < N; m++) {
            if(n != m) {
                double mult = (P[nN + m] - (Q[nN + m] / sum_Q)) * Q[nN + m];
                for(int d = 0; d < D; d++) {
                    dC[nD + d] += (Y[nD + d] - Y[mD + d]) * mult;
                }
            }
            mD += D;
        }
        nN += N;
        nD += D;
    }

    // Free memory
    free(DD); DD = NULL;
    free(Q);  Q  = NULL;
}


// Evaluate t-SNE cost function (exactly)
static double evaluateError(double* P, double* Y, int N, int D) {

    // Compute the squared Euclidean distance matrix
    double* DD = (double*) malloc(N * N * sizeof(double));
    double* Q = (double*) malloc(N * N * sizeof(double));
    if(DD == NULL || Q == NULL) { Rcpp::stop("Memory allocation failed!\n");  }
    computeSquaredEuclideanDistance(Y, N, D, DD);

    // Compute Q-matrix and normalization sum
    int nN = 0;
    double sum_Q = DBL_MIN;
    for(int n = 0; n < N; n++) {
        for(int m = 0; m < N; m++) {
            if(n != m) {
                Q[nN + m] = 1 / (1 + DD[nN + m]);
                sum_Q += Q[nN + m];
            }
            else Q[nN + m] = DBL_MIN;
        }
        nN += N;
    }
    for(int i = 0; i < N * N; i++) Q[i] /= sum_Q;

    // Sum t-SNE error
    double C = .0;
    for(int n = 0; n < N * N; n++) {
        C += P[n] * log((P[n] + FLT_MIN) / (Q[n] + FLT_MIN));
    }

    // Clean up memory
    free(DD);
    free(Q);
    return C;
}

// Evaluate t-SNE cost function (approximately)
static double evaluateError(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, int N, int D, double theta)
{

    // Get estimate of normalization term
    SPTree* tree = new SPTree(D, Y, N);
    double* buff = (double*) calloc(D, sizeof(double));
    double sum_Q = .0;
    for(int n = 0; n < N; n++) tree->computeNonEdgeForces(n, theta, buff, &sum_Q);

    // Loop over all edges to compute t-SNE error
    int ind1, ind2;
    double C = .0, Q;
    for(int n = 0; n < N; n++) {
        ind1 = n * D;
        for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) {
            Q = .0;
            ind2 = col_P[i] * D;
            for(int d = 0; d < D; d++) buff[d]  = Y[ind1 + d];
            for(int d = 0; d < D; d++) buff[d] -= Y[ind2 + d];
            for(int d = 0; d < D; d++) Q += buff[d] * buff[d];
            Q = (1.0 / (1.0 + Q)) / sum_Q;
            C += val_P[i] * log((val_P[i] + FLT_MIN) / (Q + FLT_MIN));
        }
    }

    // Clean up memory
    free(buff);
    delete tree;
    return C;
}

// Evaluate t-SNE cost function (approximately)
static double evaluateErrorDens(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, int N, int D, double theta, double logdist_shift, double dens_lambda, double var_shift, double* R, double* rho, double* re)
{

    // Get estimate of normalization term
    SPTree* tree = new SPTree(D, Y, N);

    bool re_local = re == NULL;
    if (re_local) {
      re = (double*) calloc(N, sizeof(double));
    }

    double* buff = (double*) calloc(D, sizeof(double));

    if(re == NULL || buff == NULL) { Rcpp::stop("Memory allocation failed!\n"); }

    double sum_Q = .0;
    for(int n = 0; n < N; n++) tree->computeNonEdgeForces(n, theta, buff, &sum_Q);

    // Loop over all edges to compute t-SNE error
    int ind1, ind2;
    double C = .0, Q;
    for(int n = 0; n < N; n++) {
        ind1 = n * D;

        double rphi_sum = 0.0;
        double phi_sum = 0.0;
        for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) {
            double dist = 0.0;
            ind2 = col_P[i] * D;
            for(int d = 0; d < D; d++) buff[d]  = Y[ind1 + d];
            for(int d = 0; d < D; d++) buff[d] -= Y[ind2 + d];
            for(int d = 0; d < D; d++) dist += buff[d] * buff[d];

            double r = dist;
            double tdist = 1.0 / (1.0 + dist);

            double phi = tdist;
            rphi_sum += r * phi;
            phi_sum += phi;

            Q = tdist / sum_Q;
            C += val_P[i] * log((val_P[i] + FLT_MIN) / (Q + FLT_MIN));
        }

        re[n] = log(logdist_shift + rphi_sum / phi_sum);
    }

    double re_mean = 0.0;
    double re_std = 0.0;
    for (int n = 0; n < N; n++) re_mean += re[n];
    re_mean /= N;
    for (int n = 0; n < N; n++) re_std += (re[n] - re_mean) * (re[n] - re_mean);
    re_std /= (N - 1);
    re_std = sqrt(re_std + var_shift);

    double cor = 0.0;
    for (int n = 0; n < N; n++) {
      cor += (re[n] - re_mean) / re_std * R[n];
    }
    cor /= (N - 1);

    C -= dens_lambda * cor;
    *rho = cor;

    // Clean up memory
    if (re_local) {
      free(re); re = NULL;
    }
    free(buff);
    delete tree;
    return C;
}

// Compute input similarities with a fixed perplexity
static void computeGaussianPerplexity(double* X, int N, int D, double* P, double perplexity, bool verbose) {

    // Compute the squared Euclidean distance matrix
    double* DD = (double*) malloc(N * N * sizeof(double));
    if(DD == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
    computeSquaredEuclideanDistance(X, N, D, DD);

    // Compute the Gaussian kernel row by row
    int nN = 0;
    #pragma omp parallel for schedule(guided)
    for(int n = 0; n < N; n++) {

        // Initialize some variables
        bool found = false;
        double beta = 1.0;
        double min_beta = -DBL_MAX;
        double max_beta =  DBL_MAX;
        double tol = 1e-5;
        double sum_P;

        // Iterate until we found a good perplexity
        int iter = 0;
        while(!found && iter < 200) {

            // Compute Gaussian kernel row
            for(int m = 0; m < N; m++) P[nN + m] = exp(-beta * DD[nN + m]);
            P[nN + n] = DBL_MIN;

            // Compute entropy of current row
            sum_P = DBL_MIN;
            for(int m = 0; m < N; m++) sum_P += P[nN + m];
            double H = 0.0;
            for(int m = 0; m < N; m++) H += beta * (DD[nN + m] * P[nN + m]);
            H = (H / sum_P) + log(sum_P);

            // Evaluate whether the entropy is within the tolerance level
            double Hdiff = H - log(perplexity);
            if(Hdiff < tol && -Hdiff < tol) {
                found = true;
            }
            else {
                if(Hdiff > 0) {
                    min_beta = beta;
                    if(max_beta == DBL_MAX || max_beta == -DBL_MAX)
                        beta *= 2.0;
                    else
                        beta = (beta + max_beta) / 2.0;
                }
                else {
                    max_beta = beta;
                    if(min_beta == -DBL_MAX || min_beta == DBL_MAX)
                        beta /= 2.0;
                    else
                        beta = (beta + min_beta) / 2.0;
                }
            }

            // Update iteration counter
            iter++;
        }

        // Row normalize P
        for(int m = 0; m < N; m++) P[nN + m] /= sum_P;
        nN += N;
    }

    // Clean up memory
    free(DD); DD = NULL;
}


// Compute input similarities with a fixed perplexity using ball trees (this function allocates memory another function should free)
static void computeGaussianPerplexity(double* X, int N, int D, unsigned int** _row_P, unsigned int** _col_P, double** _val_P, double** _val_D, double perplexity, int K, bool verbose) {

    if(perplexity > K) Rprintf("Perplexity should be lower than K!\n");

    // Allocate the memory we need
    *_row_P = (unsigned int*)    malloc((N + 1) * sizeof(unsigned int));
    *_col_P = (unsigned int*)    calloc(N * K, sizeof(unsigned int));
    *_val_P = (double*) calloc(N * K, sizeof(double));
    *_val_D = (double*) calloc(N * K, sizeof(double));
    if(*_row_P == NULL || *_col_P == NULL || *_val_P == NULL || *_val_D == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
    unsigned int* row_P = *_row_P;
    unsigned int* col_P = *_col_P;
    double* val_P = *_val_P;
    double* val_D = *_val_D;
    double* cur_P = (double*) malloc((N - 1) * sizeof(double));
    if(cur_P == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
    row_P[0] = 0;
    for(int n = 0; n < N; n++) row_P[n + 1] = row_P[n] + (unsigned int) K;

    // Build ball tree on data set
    VpTree<DataPoint, euclidean_distance>* tree = new VpTree<DataPoint, euclidean_distance>();
    vector<DataPoint> obj_X(N, DataPoint(D, -1, X));
    for(int n = 0; n < N; n++) obj_X[n] = DataPoint(D, n, X + n * D);
    tree->create(obj_X);

    // Loop over all points to find nearest neighbors
    if (verbose) {
      Rprintf("Building tree...\n");
    }
    vector<DataPoint> indices;
    vector<double> distances;
    for(int n = 0; n < N; n++) {

        if (verbose) {
          if(n % 10000 == 0) Rprintf(" - point %d of %d\n", n, N);
        }

        // Find nearest neighbors
        indices.clear();
        distances.clear();
        tree->search(obj_X[n], K + 1, &indices, &distances);

        // Initialize some variables for binary search
        bool found = false;
        double beta = 1.0;
        double min_beta = -DBL_MAX;
        double max_beta =  DBL_MAX;
        double tol = 1e-5;

        // Iterate until we found a good perplexity
        int iter = 0; double sum_P;
        while(!found && iter < 200) {

            // Compute Gaussian kernel row
            for(int m = 0; m < K; m++) cur_P[m] = exp(-beta * distances[m + 1] * distances[m + 1]);

            // Compute entropy of current row
            sum_P = DBL_MIN;
            for(int m = 0; m < K; m++) sum_P += cur_P[m];
            double H = .0;
            for(int m = 0; m < K; m++) H += beta * (distances[m + 1] * distances[m + 1] * cur_P[m]);
            H = (H / sum_P) + log(sum_P);

            // Evaluate whether the entropy is within the tolerance level
            double Hdiff = H - log(perplexity);
            if(Hdiff < tol && -Hdiff < tol) {
                found = true;
            }
            else {
                if(Hdiff > 0) {
                    min_beta = beta;
                    if(max_beta == DBL_MAX || max_beta == -DBL_MAX)
                        beta *= 2.0;
                    else
                        beta = (beta + max_beta) / 2.0;
                }
                else {
                    max_beta = beta;
                    if(min_beta == -DBL_MAX || min_beta == DBL_MAX)
                        beta /= 2.0;
                    else
                        beta = (beta + min_beta) / 2.0;
                }
            }

            // Update iteration counter
            iter++;
        }

        // Row-normalize current row of P and store in matrix
        for(int m = 0; m < K; m++) cur_P[m] /= sum_P;
        for(int m = 0; m < K; m++) {
            col_P[row_P[n] + m] = (unsigned int) indices[m + 1].index();
            val_P[row_P[n] + m] = cur_P[m];
            val_D[row_P[n] + m] = distances[m + 1] * distances[m + 1];
        }
    }

    // Clean up memory
    obj_X.clear();
    free(cur_P);
    delete tree;
}

static void computeExpectedLogDist(double** _out, unsigned int* row_P, unsigned int* col_P, double* val_P, double* val_D, int N, double logdist_shift) {

    *_out = (double*) malloc(N * sizeof(double));
    if (*_out == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
    double* out = *_out;

    for (int n = 0; n < N; n++) {
      double sum_PD = 0;
      double sum_P = 0;
      for (unsigned int i = row_P[n]; i < row_P[n + 1]; i++) {
        sum_PD += val_P[i] * val_D[i];
        sum_P += val_P[i];
      }
      out[n] = log(logdist_shift + sum_PD / sum_P);
    }
}

// Symmetrizes a sparse matrix
static void symmetrizeMatrix(unsigned int** _row_P, unsigned int** _col_P, double** _val_P, double** _val_D, unsigned int N) {

    // Get sparse matrix
    unsigned int* row_P = *_row_P;
    unsigned int* col_P = *_col_P;
    double* val_P = *_val_P;
    double* val_D = *_val_D;

    // Count number of elements and row counts of symmetric matrix
    int* row_counts = (int*) calloc(N, sizeof(int));
    if(row_counts == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
    for(unsigned int n = 0; n < N; n++) {
        for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) {

            // Check whether element (col_P[i], n) is present
            bool present = false;
            for(unsigned int m = row_P[col_P[i]]; m < row_P[col_P[i] + 1]; m++) {
                if(col_P[m] == n) present = true;
            }
            if(present) row_counts[n]++;
            else {
                row_counts[n]++;
                row_counts[col_P[i]]++;
            }
        }
    }
    int no_elem = 0;
    for(unsigned int n = 0; n < N; n++) no_elem += row_counts[n];

    // Allocate memory for symmetrized matrix
    unsigned int* sym_row_P = (unsigned int*) malloc((N + 1) * sizeof(unsigned int));
    unsigned int* sym_col_P = (unsigned int*) malloc(no_elem * sizeof(unsigned int));
    double* sym_val_P = (double*) malloc(no_elem * sizeof(double));
    double* sym_val_D = (double*) malloc(no_elem * sizeof(double));
    if(sym_row_P == NULL || sym_col_P == NULL || sym_val_P == NULL || sym_val_D == NULL) { Rcpp::stop("Memory allocation failed!\n"); }

    // Construct new row indices for symmetric matrix
    sym_row_P[0] = 0;
    for(unsigned int n = 0; n < N; n++) sym_row_P[n + 1] = sym_row_P[n] + (unsigned int) row_counts[n];

    // Fill the result matrix
    int* offset = (int*) calloc(N, sizeof(int));
    if(offset == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
    for(unsigned int n = 0; n < N; n++) {
        for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) {                                  // considering element(n, col_P[i])

            // Check whether element (col_P[i], n) is present
            bool present = false;
            for(unsigned int m = row_P[col_P[i]]; m < row_P[col_P[i] + 1]; m++) {
                if(col_P[m] == n) {
                    present = true;
                    if(n <= col_P[i]) {                                                 // make sure we do not add elements twice
                        sym_col_P[sym_row_P[n]        + offset[n]]        = col_P[i];
                        sym_col_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = n;
                        sym_val_P[sym_row_P[n]        + offset[n]]        = val_P[i] + val_P[m];
                        sym_val_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = val_P[i] + val_P[m];
                    }
                }
            }

            // If (col_P[i], n) is not present, there is no addition involved
            if(!present) {
                sym_col_P[sym_row_P[n]        + offset[n]]        = col_P[i];
                sym_col_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = n;
                sym_val_P[sym_row_P[n]        + offset[n]]        = val_P[i];
                sym_val_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = val_P[i];
            }

                sym_val_D[sym_row_P[n] + offset[n]] = val_D[i];
                sym_val_D[sym_row_P[col_P[i]] + offset[col_P[i]]] = val_D[i]; 

            // Update offsets
            if(!present || (present && n <= col_P[i])) {
                offset[n]++;
                if(col_P[i] != n) offset[col_P[i]]++;
            }
        }
    }

    // Divide the result by two
    for(int i = 0; i < no_elem; i++) sym_val_P[i] /= 2.0;

    // Return symmetrized matrices
    free(*_row_P); *_row_P = sym_row_P;
    free(*_col_P); *_col_P = sym_col_P;
    free(*_val_P); *_val_P = sym_val_P;
    free(*_val_D); *_val_D = sym_val_D;

    // Free up some memery
    free(offset); offset = NULL;
    free(row_counts); row_counts  = NULL;
}

// Compute squared Euclidean distance matrix
static void computeSquaredEuclideanDistance(double* X, int N, int D, double* DD) {
    const double* XnD = X;
    for(int n = 0; n < N; ++n, XnD += D) {
        const double* XmD = XnD + D;
        double* curr_elem = &DD[n*N + n];
        *curr_elem = 0.0;
        double* curr_elem_sym = curr_elem + N;
        for(int m = n + 1; m < N; ++m, XmD+=D, curr_elem_sym+=N) {
            *(++curr_elem) = 0.0;
            for(int d = 0; d < D; ++d) {
                *curr_elem += (XnD[d] - XmD[d]) * (XnD[d] - XmD[d]);
            }
            *curr_elem_sym = *curr_elem;
        }
    }
}

static void standardize(double* x, int N) {
  double mean = 0;
  for (int n = 0; n < N; n++) {
    mean += x[n];
  }
  mean /= N;

  double stdev = 0;
  for (int n = 0; n < N; n++) {
    stdev += (x[n] - mean) * (x[n] - mean);
  }
  stdev = sqrt(stdev / (N - 1));
  
  for (int n = 0; n < N; n++) {
    x[n] = (x[n] - mean) / stdev;
  }
}

// Makes data zero-mean
static void zeroMean(double* X, int N, int D) {

    // Compute data mean
    double* mean = (double*) calloc(D, sizeof(double));
    if(mean == NULL) { Rcpp::stop("Memory allocation failed!\n"); }
    int nD = 0;
    for(int n = 0; n < N; n++) {
        for(int d = 0; d < D; d++) {
            mean[d] += X[nD + d];
        }
        nD += D;
    }
    for(int d = 0; d < D; d++) {
        mean[d] /= (double) N;
    }

    // Subtract data mean
    nD = 0;
    for(int n = 0; n < N; n++) {
        for(int d = 0; d < D; d++) {
            X[nD + d] -= mean[d];
        }
        nD += D;
    }
    free(mean); mean = NULL;
}
