#!/usr/bin/env Rscript
# Requires the "cluster" and "fpc" packages (accessed below via ::).

# Read the CT filename as a command-line argument.
args <- commandArgs(TRUE)

# Check that an argument was supplied.
if (length(args) == 0) {
  cat("Usage: ./RsampleCluster.R stochastic_sample.ct\n")
  stop("CT file with stochastic sample not provided.\n")
}

# Check that the CT file exists.
filename <- as.character(args[1])
if (!file.exists(filename)) {
  stop("CT file with stochastic sample: ", filename, " does not exist.\n")
}

# Alternatively, to run the script from within R, set filename on the
# following line and comment out all the lines above.
#filename <- "stochastic.1000.ct"

# Base name for the output files: the input filename with the .ct extension stripped.
name <- sub("\\.ct$", "", filename)
cat("Filename: ", filename, "\n")

# Cluster into the optimal number of clusters or a specified number of clusters.
# If type = 0 the optimal number of clusters is determined from the CH index;
# otherwise the number given in type (between 2 and 10) is used.
type <- 0
if (type > 1) cat("Clustering into", type, "clusters\n")

# Get the number of nucleotides and the number of sampled conformations.
no_nucl <- scan(file = filename, what = integer(), nmax = 1, nlines = 1)
no_lines <- length(readLines(filename))
no_struct <- no_lines / (no_nucl + 1)
confi <- matrix(NA_integer_, nrow = no_struct, ncol = no_nucl)

# Read the sequence (column 2) from the CT file.
seq <- as.matrix(read.table(filename, skip = 1, nrows = no_nucl,
                            colClasses = c("NULL", "character", "NULL", "NULL", "NULL", "NULL")))

# Read the pairing partner (column 5) of every nucleotide in every structure.
for (i in 1:no_struct) {
  confi[i, ] <- as.matrix(read.table(filename, header = FALSE, nrows = no_nucl,
                                     skip = (i - 1) * (no_nucl + 1) + 1,
                                     colClasses = c("NULL", "NULL", "NULL", "NULL", "integer", "NULL")))
}

# Build the pairing data matrix: pairing_data[i, j, k] is 1 if nucleotides j and k
# are paired in structure i, and 0 otherwise.
pairing_data <- array(0L, dim = c(no_struct, no_nucl, no_nucl))
for (i in 1:no_struct) {
  for (j in 1:no_nucl) {
    if (confi[i, j] != 0) {
      pairing_data[i, j, confi[i, j]] <- 1L
    }
  }
}

# Calculate the distance matrix.
# A roughly 2x speedup is available by using Fortran to compute distance_matrix.
# The files calcdistf90.R and calcdist.f90 are in the Rsample directory; they
# should be in the same directory as this script, with calcdist.f90 compiled to calcdist.so.
if (file.exists("calcdistf90.R") && file.exists("calcdist.so")) {
  source("calcdistf90.R")
  distance_matrix <- calcdistf90(pairing_data)
} else {
  # Use R to compute distance_matrix. This is about 2x slower than Fortran.
  distance_matrix <- matrix(NA_integer_, nrow = no_struct, ncol = no_struct)
  for (i in 1:no_struct) {
    for (j in 1:no_struct) {
      distance_matrix[i, j] <- sum(abs(pairing_data[i, , ] - pairing_data[j, , ]))
    }
  }
}

# Do DIANA (divisive) clustering on the precomputed dissimilarity matrix;
# metric and stand are ignored when diss = TRUE.
dv <- cluster::diana(distance_matrix, diss = TRUE, metric = "manhattan", stand = TRUE)

# Cut the divisive hierarchy into 2 through 10 clusters.
dvc <- matrix(NA_integer_, nrow = 9, ncol = no_struct)
for (i in 1:9) {
  dvc[i, ] <- cutree(dv, k = i + 1)
}

# Calculate the Calinski-Harabasz (CH) index for each candidate number of clusters.
ch_ind <- double(9)
for (i in 1:9) {
  temp <- fpc::cluster.stats(distance_matrix, dvc[i, ])
  ch_ind[i] <- temp$ch
}

# Find the cut with the maximum CH index, or use the requested number of clusters.
# opt_clust_size is the row index into dvc, i.e. the number of clusters minus 1.
if (type == 0) opt_clust_size <- which.max(ch_ind) else opt_clust_size <- type - 1

# Count the number of structures in each cluster.
clust_size <- integer(opt_clust_size + 1)
for (i in 1:(opt_clust_size + 1)) {
  clust_size[i] <- length(which(dvc[opt_clust_size, ] == i))
}
cat("Optimal # of clusters:", opt_clust_size + 1, "\n")
for (i in 1:(opt_clust_size + 1)) {
  cat("Size of cluster", i, clust_size[i])
  cat("\n")
}
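
# Optional diagnostic (a minimal sketch, left commented out so the script's
# output is unchanged): print the CH index for every candidate number of
# clusters, which can help when setting type by hand instead of using type = 0.
# for (i in 1:9) cat(i + 1, "clusters: CH index =", ch_ind[i], "\n")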

# Calculate the average base-pair occupancy in each cluster:
# bp_sum_cluster[i, j, k] is the fraction of structures in cluster i in which
# nucleotide j is paired to nucleotide k.
bp_sum_cluster <- array(NA_real_, dim = c(opt_clust_size + 1, no_nucl, no_nucl))
for (i in 1:(opt_clust_size + 1)) {
  # temp is the subset of pairing_data belonging to cluster i; drop = FALSE keeps
  # the array three-dimensional even when the cluster contains a single structure.
  temp <- pairing_data[which(dvc[opt_clust_size, ] == i), , , drop = FALSE]
  for (j in 1:no_nucl) {
    for (k in 1:no_nucl) {
      bp_sum_cluster[i, j, k] <- sum(temp[, j, k]) / clust_size[i]
    }
  }
}

# Find, for every nucleotide in every cluster, the most frequent pairing partner
# and its base-pairing probability.
max_position_cluster <- array(NA_integer_, dim = c(opt_clust_size + 1, no_nucl))
bp_prob_cluster <- array(NA_real_, dim = c(opt_clust_size + 1, no_nucl))
for (i in 1:(opt_clust_size + 1)) {
  for (j in 1:no_nucl) {
    max_position_cluster[i, j] <- which.max(bp_sum_cluster[i, j, ])
    bp_prob_cluster[i, j] <- bp_sum_cluster[i, j, max_position_cluster[i, j]]
  }
}

# Determine each centroid by keeping only pairs with probability > 0.5.
centroid <- array(0L, dim = c(opt_clust_size + 1, no_nucl))
for (i in 1:(opt_clust_size + 1)) {
  for (j in 1:no_nucl) {
    if (bp_prob_cluster[i, j] > 0.5) {
      centroid[i, j] <- max_position_cluster[i, j]
    }
  }
}

# Write each centroid structure to its own CT file.
for (i in 1:(opt_clust_size + 1)) {
  outfile <- paste(name, ".centroid.", i, ".ct", sep = "")
  cat(no_nucl, name, "centroid", i, "\n", file = outfile, append = FALSE)
  for (j in 1:no_nucl) {
    cat(j, seq[j], j - 1, j + 1, centroid[i, j], j, "\n", sep = " ",
        file = outfile, append = TRUE)
  }
}
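
# Example invocation (filenames are illustrative):
#   ./RsampleCluster.R stochastic.1000.ct
# or, equivalently,
#   Rscript RsampleCluster.R stochastic.1000.ct
# For an input named stochastic.1000.ct this prints the cluster sizes to the
# console and writes one centroid per cluster:
# stochastic.1000.centroid.1.ct, stochastic.1000.centroid.2.ct, and so on.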