/* uproc-makedb
 * Create a new uproc database.
 *
 * Copyright 2014 Peter Meinicke, Robin Martinjak
 *
 * This file is part of uproc.
 *
 * uproc is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * uproc is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with uproc.  If not, see <http://www.gnu.org/licenses/>.
 */


#if HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <ctype.h>

#include <uproc.h>
#include "common.h"
#include "makedb.h"

/* Per-family counters, indexed by family id. Incremented in extract_uniques()
 * when a word is seen with two different families, and in filter_singletons()
 * for every entry that is not kept. */
unsigned long filtered_counts[UPROC_FAMILY_MAX] = { 0 };

/* One word together with the family and the two taxonomy values attached to
 * it. Filled from the search tree in extract_uniques() and written to the
 * ecurve by insert_entries(). */
struct ecurve_entry
{
    struct uproc_word word;   /* the word itself (prefix and suffix) */
    uproc_family family;      /* family stored for this word */
    uproc_family tax_X;       /* becomes suffix_entry.xDim in insert_entries() */
    uproc_family tax_Y;       /* becomes suffix_entry.yDim in insert_entries() */
};

/* Result of crop_first_word(): the cropped header and the two numbers that
 * were parsed from the fields following the first word. */
struct return_struct {
char  *s;              /* the header buffer, reduced to its first word */
uproc_family xdim;     /* first number after the first word (0 if absent) */
uproc_family ydim;     /* second number after the first word (0 if absent) */
};

//static char *
struct return_struct
crop_first_word(char *s)
{
char *p1, *p2, *p3;
int iter, iter2;
size_t len;
    
    
uproc_family d[2];
    
  for (iter = 0; iter < 2; iter++)
  {
  d[iter] = 0;
  }
    
char dest[100];
    
p1 = s;
  while (isspace(*p1)) 
  {
  p1++;
  }
    
p2 = strpbrk(p1, ", \f\n\r\t\v");
  if (p2) 
  {
  len = p2 - p1;
  memcpy(dest, p2+1, len-1);
  p1 = p1 + len + 1;
  dest[len-1] = '\0';
  //printf("|%s|\t%lu\n", dest,len);
  iter = 0;
  d[iter] = atoi(dest);
  iter++;
  
    while (p2) 
    {
    p2 = strpbrk(p1, ", \f\n\r\t\v");
      if (p2)
      {
      len = p2 - p1+1;
      memcpy(dest, p2+1, len-1);
      p1 = p1 + len;
      dest[len-1] = '\0';
      //printf("\t|%s|\t%lu\n", dest,len);
      d[iter] = atoi(dest);
      //printf("%s\n", dest);
      //printf("%d\n", atoi(p1));
      iter++;
      
      }
    }
  len = strlen(p1);
  }
    
p1 = s;
  while (isspace(*p1)) 
  {
  p1++;
  }
    
p2 = strpbrk(p1, ", \f\n\r\t\v");
    
  if (p2) 
  {
  len = p2 - p1 + 1;
  *p2 = '\0';  
  }

  else 
  {
  len = strlen(p1);
  }
    
    memmove(s, p1, len + 1);
    
    struct return_struct return_val;
    
    return_val.s = s;
    return_val.xdim = d[0];
    return_val.ydim = d[1];
    
    return return_val;
}


//Heiner Insert
/**
 * Read a taxonomy matrix from a delimiter-separated text file.
 *
 * The result has `xDim` rows of `yDim` columns, all initialised to 0. Every
 * line of the file fills one row. Values are separated by a single comma or
 * whitespace character; an empty field is stored as 0. Rows and columns that
 * do not fit into the matrix are ignored instead of being written past the
 * end of the allocation. A last value that is not followed by a newline is
 * stored as well. Lines may be arbitrarily long.
 *
 * @param filename  path of the file to read
 * @param yDim      number of columns per row
 * @param xDim      number of rows
 * @return          array of `xDim` row pointers (each row and the array
 *                  itself must be released with free() by the caller), or
 *                  NULL if a dimension is not positive or the file cannot be
 *                  opened. Running out of memory terminates the program
 *                  with a failure status.
 */
unsigned int **
taxamat(char *filename, int yDim, int xDim)
{
    static const char delims[] = ", \f\n\r\t\v";
    unsigned int **TaxArray;
    size_t n_rows, n_cols;
    size_t row, col, tok_len;
    char tok[64];
    FILE *fp;
    int c;

    if (!filename || xDim <= 0 || yDim <= 0) {
        return NULL;
    }
    n_rows = (size_t)xDim;
    n_cols = (size_t)yDim;

    TaxArray = calloc(n_rows, sizeof *TaxArray);
    if (!TaxArray) {
        printf("Konnte keinen Speicher bereitstellen...\n");
        exit(EXIT_FAILURE);
    }
    for (row = 0; row < n_rows; row++) {
        TaxArray[row] = calloc(n_cols, sizeof *TaxArray[row]);
        if (!TaxArray[row]) {
            printf("Konnte keinen Speicher bereitstellen...\n");
            exit(EXIT_FAILURE);
        }
    }

    fp = fopen(filename, "r");
    if (!fp) {
        perror(filename);
        for (row = 0; row < n_rows; row++) {
            free(TaxArray[row]);
        }
        free(TaxArray);
        return NULL;
    }

    row = col = tok_len = 0;
    while (row < n_rows && (c = fgetc(fp)) != EOF) {
        if (c == '\0' || !strchr(delims, c)) {
            /* part of a value; overlong values are truncated */
            if (tok_len < sizeof tok - 1) {
                tok[tok_len++] = (char)c;
            }
            continue;
        }

        /* a delimiter ends the current (possibly empty) field */
        tok[tok_len] = '\0';
        if (col < n_cols) {
            TaxArray[row][col] = (unsigned int)strtoul(tok, NULL, 10);
        }
        col++;
        tok_len = 0;

        if (c == '\n') {
            row++;
            col = 0;
        }
    }

    /* last value of a file that does not end with a newline */
    if (row < n_rows && col < n_cols && tok_len > 0) {
        tok[tok_len] = '\0';
        TaxArray[row][col] = (unsigned int)strtoul(tok, NULL, 10);
    }

    fclose(fp);
    return TaxArray;
}




//Heiner Insert



/**
 * Reverse a NUL-terminated string in place.
 *
 * Strings of length 0 or 1 are returned untouched; this also avoids forming
 * a pointer before the start of the buffer for the empty string.
 *
 * @param s  writable string
 * @return   `s`
 */
static char *
reverse_string(char *s)
{
    size_t len = strlen(s);
    size_t lo, hi;

    if (len < 2) {
        return s;
    }
    for (lo = 0, hi = len - 1; lo < hi; lo++, hi--) {
        char tmp = s[hi];
        s[hi] = s[lo];
        s[lo] = tmp;
    }
    return s;
}

/**
 * Collect every word that starts with amino acid `first` from all sequences
 * in `stream` and return them as an array.
 *
 * Each sequence header is cropped with crop_first_word(); the first word is
 * looked up in `idmap` to obtain the family, the two numbers from the header
 * are used as indices into `xArray` / `yArray`. Words are kept in a search
 * tree keyed by the word:
 *  - a new word is stored with the family and xArray[x] / yArray[y],
 *  - a word already stored with a different family gets family 0 and both
 *    families are counted in filtered_counts,
 *  - in both "already stored" cases the taxonomy level is re-evaluated by
 *    walking the columns of `TaxArray` (see comments below).
 * Finally all tree entries whose family is not UPROC_FAMILY_INVALID are
 * copied to a newly allocated array.
 *
 * @param stream     input sequences
 * @param alpha      alphabet used to build words
 * @param idmap      maps header words to families
 * @param xArray     lookup table indexed by the first header number
 * @param yArray     lookup table indexed by the second header number / level
 * @param TaxArray   taxonomy matrix, indexed [xArray[...]][level]
 * @param first      only words starting with this amino acid are collected
 * @param reverse    reverse each sequence before extracting words
 * @param entries    OUT: malloc()ed array, to be freed by the caller; may be
 *                   NULL if no entries were found
 * @param n_entries  OUT: number of elements in *entries
 * @return 0 on success, nonzero on failure
 */
static int
extract_uniques(uproc_io_stream *stream, const uproc_alphabet *alpha,
                uproc_idmap *idmap, unsigned int *xArray, unsigned int *yArray, unsigned int **TaxArray, uproc_amino first, bool reverse,
                struct ecurve_entry **entries, size_t *n_entries)
{
    int res;

    uproc_seqiter *rd;
    struct uproc_sequence seq;
    size_t index;

    uproc_bst *tree;
    uproc_bstiter *tree_iter;
    union uproc_bst_key tree_key;

    uproc_family family;
    uproc_family x_Dim, y_Dim;

    struct return_struct return_val;
    struct ecurve_entry *entries_insert;

    /* highest level reported so far, only used to limit debug output */
    int qDim = -1;

    fprintf(stdout,"uproc_bst_create\t%lu\n",(unsigned long)sizeof family);
    tree = uproc_bst_create(UPROC_BST_WORD, sizeof family);
    if (!tree) {
        return -1;
    }
    fprintf(stdout,"uproc_seqiter_create\n");
    rd = uproc_seqiter_create(stream);
    if (!rd) {
        res = -1;
        goto error;
    }

    while (res = uproc_seqiter_next(rd, &seq), !res) {
        uproc_worditer *word_iter;
        struct uproc_word fwd_word = UPROC_WORD_INITIALIZER,
                          rev_word = UPROC_WORD_INITIALIZER;
        uproc_family tmp_family, tmp_x_Dim, tmp_y_Dim, tmp_cur;

        return_val = crop_first_word(seq.header);
        seq.header = return_val.s;
        family = uproc_idmap_family(idmap, seq.header);
        x_Dim = return_val.xdim;
        y_Dim = return_val.ydim;

        if (family == UPROC_FAMILY_INVALID) {
            res = -1;
            break;
        }

        if (reverse) {
            reverse_string(seq.data);
        }

        word_iter = uproc_worditer_create(seq.data, alpha);
        if (!word_iter) {
            res = -1;
            break;
        }

        while (res = uproc_worditer_next(word_iter, &index, &fwd_word, &rev_word),
               !res) {
            if (!uproc_word_startswith(&fwd_word, first)) {
                continue;
            }
            tree_key.word = fwd_word;
            res = uproc_bst_get(tree, tree_key, &tmp_family, &tmp_x_Dim, &tmp_y_Dim);

            if (!res) {
                /* word was already present */
                if (tmp_family != family) {
                    /* stored family differs -> count both as filtered */
                    filtered_counts[family] += 1;
                    if (tmp_family != UPROC_FAMILY_INVALID && tmp_family != 0) {
                        filtered_counts[tmp_family] += 1;
                    }

                    /* start at the higher of the two levels */
                    tmp_cur = (y_Dim > tmp_y_Dim) ? y_Dim : tmp_y_Dim;

                    /* NOTE(review): the loop reads column tmp_cur + 1, i.e.
                     * up to column 8 - confirm that every TaxArray row has at
                     * least 9 columns. */
                    while (tmp_cur <= 7) {
                        /* both rows agree on a nonzero value at this level */
                        if (TaxArray[xArray[tmp_x_Dim]][tmp_cur] == TaxArray[xArray[x_Dim]][tmp_cur]
                            && TaxArray[xArray[x_Dim]][tmp_cur] != 0) {
                            break;
                        }

                        /* one row has no value here: accept the next level
                         * if both rows agree on a nonzero value there */
                        if (TaxArray[xArray[tmp_x_Dim]][tmp_cur] == 0
                            || TaxArray[xArray[x_Dim]][tmp_cur] == 0) {
                            if (TaxArray[xArray[tmp_x_Dim]][tmp_cur + 1] == TaxArray[xArray[x_Dim]][tmp_cur + 1]
                                && TaxArray[xArray[x_Dim]][tmp_cur + 1] != 0) {
                                if (TaxArray[xArray[tmp_x_Dim]][tmp_cur] == 0) {
                                    tmp_x_Dim = x_Dim;
                                }
                                tmp_cur++;
                                break;
                            }
                        }
                        tmp_cur++;
                    }
                    tmp_family = 0;
                    tmp_y_Dim = tmp_cur;

                    if (qDim < tmp_y_Dim) {
                        qDim = tmp_y_Dim;
                        printf("tmp_x_Dim: %d\t%u\t", tmp_x_Dim,xArray[tmp_x_Dim]);
                        printf("tmp_y_Dim: %d\t%u\n", tmp_y_Dim,yArray[tmp_y_Dim]);
                    }

                    res = uproc_bst_update(tree, tree_key, &tmp_family, &xArray[tmp_x_Dim], &yArray[tmp_y_Dim]);
                }
                else {
                    /* same family: keep it, only re-evaluate the level */
                    tmp_cur = (y_Dim > tmp_y_Dim) ? y_Dim : tmp_y_Dim;

                    while (tmp_cur <= 7) {
                        if (TaxArray[xArray[tmp_x_Dim]][tmp_cur] == TaxArray[xArray[x_Dim]][tmp_cur]
                            && TaxArray[xArray[x_Dim]][tmp_cur] != 0) {
                            break;
                        }

                        if (TaxArray[xArray[tmp_x_Dim]][tmp_cur] == 0
                            || TaxArray[xArray[x_Dim]][tmp_cur] == 0) {
                            if (TaxArray[xArray[tmp_x_Dim]][tmp_cur + 1] == TaxArray[xArray[x_Dim]][tmp_cur + 1]
                                && TaxArray[xArray[x_Dim]][tmp_cur + 1] != 0) {
                                /* unlike above, always switch to the new x */
                                tmp_x_Dim = x_Dim;
                                tmp_cur++;
                                break;
                            }
                        }
                        tmp_cur++;
                    }

                    tmp_y_Dim = tmp_cur;
                    res = uproc_bst_update(tree, tree_key, &tmp_family, &xArray[tmp_x_Dim], &yArray[tmp_y_Dim]);
                }
            }
            else if (res == UPROC_BST_KEY_NOT_FOUND) {
                res = uproc_bst_update(tree, tree_key, &family, &xArray[x_Dim], &yArray[y_Dim]);
            }
            if (res) {
                break;
            }
        }
        uproc_worditer_destroy(word_iter);
        if (res < 0) {
            break;
        }
    }

    uproc_seqiter_destroy(rd);
    if (res == -1) {
        goto error;
    }

    /* first pass over the tree: count the entries to export */
    tree_iter = uproc_bstiter_create(tree);
    if (!tree_iter) {
        res = -1;
        goto error;
    }
    *n_entries = 0;
    while (!uproc_bstiter_next(tree_iter, &tree_key, &family, &x_Dim, &y_Dim)) {
        if (family != UPROC_FAMILY_INVALID) {
            *n_entries += 1;
        }
    }
    /* release the iterator before anything can fail, so it cannot leak */
    uproc_bstiter_destroy(tree_iter);

    *entries = malloc(*n_entries * sizeof **entries);
    /* malloc(0) may legitimately return NULL; that is not an error */
    if (!*entries && *n_entries) {
        res = uproc_error(UPROC_ENOMEM);
        goto error;
    }

    /* second pass: copy the entries */
    tree_iter = uproc_bstiter_create(tree);
    if (!tree_iter) {
        res = -1;
        goto error;
    }
    entries_insert = *entries;
    while (!uproc_bstiter_next(tree_iter, &tree_key, &family, &x_Dim, &y_Dim)) {
        if (family != UPROC_FAMILY_INVALID) {
            entries_insert->word = tree_key.word;
            entries_insert->family = family;
            entries_insert->tax_X = x_Dim;
            entries_insert->tax_Y = y_Dim;
            entries_insert++;
        }
    }
    uproc_bstiter_destroy(tree_iter);

    res = 0;
error:
    uproc_bst_destroy(tree);

    return res;
}


/**
 * Remove entries that have no sufficiently similar neighbour.
 *
 * Every pair of adjacent entries is aligned with the substitution matrix;
 * the per-position distances are summed up. If the sum exceeds the
 * similarity threshold, both entries are marked as part of a cluster. If
 * their families differ, both families are additionally set to 0. All
 * entries that were never marked are dropped (and counted in
 * filtered_counts); the remaining ones are moved to the front of `entries`.
 *
 * The substitution matrix is released before returning (it used to leak on
 * every call). If it cannot be loaded, or the scratch buffer cannot be
 * allocated, the program terminates with an error message instead of
 * dereferencing a NULL pointer.
 *
 * NOTE(review): the substitution matrix is loaded from a hard-coded absolute
 * path; this should become a parameter or configuration value.
 *
 * @param entries   array of entries, modified in place
 * @param n         number of elements in `entries`
 * @param alpha     unused
 * @param TaxArray  unused
 * @return          number of entries kept
 */
static size_t
filter_singletons(struct ecurve_entry *entries, size_t n, const uproc_alphabet *alpha, unsigned int **TaxArray)
{
    enum
    {
        SINGLE,
        CLUSTER,
        BRIDGED,
        CROSSOVER
    };
    const double minSimilarScore = 0.25;
    uproc_substmat *substmat;
    unsigned char *types;
    double dist[UPROC_SUFFIX_LEN];
    size_t i, k;

    (void)alpha;
    (void)TaxArray;

    substmat = uproc_substmat_load(UPROC_IO_GZIP, "%s/substmat", "/home/hklingen/DB/PFAM/Comet/model/model/");
    /* calloc zero-fills, so every entry starts out as SINGLE */
    types = calloc(n, sizeof *types);
    if (!substmat || (!types && n)) {
        fprintf(stderr, "filter_singletons: failed to load substitution matrix or allocate memory\n");
        exit(EXIT_FAILURE);
    }

    fprintf(stdout,"Starting Singelton detection\n");
    for (i = 0; i + 1 < n; i++) {
        struct ecurve_entry *e = &entries[i];
        unsigned char *t = &types[i];
        double sum = 0.0;

        /* compare this entry with its successor */
        uproc_substmat_align_suffixes(substmat, e[0].word.suffix, e[1].word.suffix,dist);
        /* sum over the whole dist array rather than a hard-coded 12 */
        for (k = 0; k < UPROC_SUFFIX_LEN; k++) {
            sum = sum + dist[k];
        }

        if (minSimilarScore < sum) {
            if (e[0].family != e[1].family) {
                /* similar words, different families: family 0 */
                e[0].family = 0;
                e[1].family = 0;
            }
            t[0] = CLUSTER;
            t[1] = CLUSTER;
        }
    }
    fprintf(stdout,"Finished Singelton detection\n");

    fprintf(stdout,"Starting Cluster detection\n");

    /* compact the array, keeping only marked entries */
    for (i = k = 0; i < n; i++) {
        if (types[i] == CLUSTER || types[i] == BRIDGED) {
            entries[k] = entries[i];
            k++;
        }
        else {
            filtered_counts[entries[i].family] += 1;
        }
    }
    fprintf(stdout,"Finished Cluster detection\n");

    uproc_substmat_destroy(substmat);
    free(types);
    return k;
}


/**
 * Add all entries to the ecurve, one prefix at a time.
 *
 * Consecutive entries with the same prefix are gathered in a suffix list;
 * whenever the prefix changes (and once more at the end) the list is handed
 * to uproc_ecurve_add_prefix().
 *
 * @param ecurve     ecurve to extend
 * @param entries    array of entries
 * @param n_entries  number of elements in `entries`; 0 is allowed and is a
 *                   no-op (previously entries[0] was read regardless)
 * @return 0 on success, nonzero on failure
 */
static int
insert_entries(uproc_ecurve *ecurve, struct ecurve_entry *entries,
               size_t n_entries)
{
    int res = 0;
    size_t i;
    uproc_prefix current_prefix;
    uproc_list *suffix_list;
    struct uproc_ecurve_suffixentry suffix_entry;

    if (!n_entries) {
        return 0;
    }

    suffix_list = uproc_list_create(sizeof suffix_entry);
    if (!suffix_list) {
        return -1;
    }

    current_prefix = entries[0].word.prefix;

    for (i = 0; i < n_entries; i++) {
        if (entries[i].word.prefix != current_prefix) {
            /* prefix changed: flush what was collected so far */
            res = uproc_ecurve_add_prefix(ecurve, current_prefix, suffix_list);
            if (res) {
                goto error;
            }
            uproc_list_clear(suffix_list);
            current_prefix = entries[i].word.prefix;
        }
        suffix_entry.suffix = entries[i].word.suffix;
        suffix_entry.family = entries[i].family;
        suffix_entry.xDim = entries[i].tax_X;
        suffix_entry.yDim = entries[i].tax_Y;
        res = uproc_list_append(suffix_list, &suffix_entry);
        if (res) {
            goto error;
        }
    }
    /* flush the suffixes of the last prefix */
    res = uproc_ecurve_add_prefix(ecurve, current_prefix, suffix_list);
error:
    uproc_list_destroy(suffix_list);
    return res;
}


/**
 * Build one ecurve (forward or reverse) from the sequences in `infile`.
 *
 * The input file is read once per amino acid of the alphabet; each pass
 * extracts the words starting with that amino acid (extract_uniques), drops
 * singletons (filter_singletons) and inserts the rest (insert_entries).
 *
 * Fixes over the previous version:
 *  - `res` and `first` are initialised, so a failing uproc_ecurve_create()
 *    no longer returns / tests indeterminate values,
 *  - `entries` is reset after free(), so a failure before extract_uniques()
 *    allocates a new array cannot lead to a double free,
 *  - on failure the ecurve is always destroyed and *ecurve set to NULL, also
 *    when the failure happens in the very first pass.
 *
 * @param infile    path of the sequence file
 * @param alphabet  alphabet string
 * @param idmap     maps header words to families
 * @param xArray    passed through to extract_uniques()
 * @param yArray    passed through to extract_uniques()
 * @param TaxArray  passed through to extract_uniques() / filter_singletons()
 * @param reverse   build the reverse ecurve
 * @param ecurve    OUT: the new ecurve, NULL on failure
 * @return 0 on success, nonzero on failure
 */
static int
build_ecurve(const char *infile,
             const char *alphabet,
             uproc_idmap *idmap,
	     unsigned int *xArray,
	     unsigned int *yArray,
	     unsigned int **TaxArray,
             bool reverse,
             uproc_ecurve **ecurve)
{
    int res = -1;
    uproc_io_stream *stream;
    struct ecurve_entry *entries = NULL;
    size_t n_entries;
    uproc_amino first = 0;
    uproc_alphabet *alpha;

    printf("going to create alphabet now.\n");
    alpha = uproc_alphabet_create(alphabet);
    if (!alpha) {
        return -1;
    }
    printf("going to create ecurve now.\n");
    *ecurve = uproc_ecurve_create(alphabet, 0);
    if (!*ecurve) {
        goto error;
    }

    progress(uproc_stderr, reverse ? "rev.ecurve" : "fwd.ecurve", -1.0);
    for (first = 0; first < UPROC_ALPHABET_SIZE; first++) {
        n_entries = 0;
        free(entries);
        /* avoid a dangling pointer if we bail out before reallocation */
        entries = NULL;

        stream = uproc_io_open("r", UPROC_IO_GZIP, infile);
        if (!stream) {
            res = -1;
            goto error;
        }
        progress(uproc_stderr, NULL, first * 100 / UPROC_ALPHABET_SIZE);
        fprintf(stdout,"Uniques\n");
        res = extract_uniques(stream, alpha, idmap, xArray, yArray, TaxArray,first, reverse,
                              &entries, &n_entries);
        uproc_io_close(stream);
        fprintf(stdout,"...done\n");
        if (res) {
            goto error;
        }
        fprintf(stdout,"Singletons\n");
        n_entries = filter_singletons(entries, n_entries, alpha, TaxArray);
        if (!n_entries) {
            continue;
        }
        fprintf(stdout,"...done\n");
        fprintf(stdout,"Insert\n");
        res = insert_entries(*ecurve, entries, n_entries);
        if (res) {
            goto error;
        }
        fprintf(stdout,"...done\n");
    }
    fprintf(stdout,"<\t");
    uproc_ecurve_finalize(*ecurve);
    fprintf(stdout,">\n");

    progress(uproc_stderr, NULL, first * 100 / UPROC_ALPHABET_SIZE);
    res = 0;

error:
    if (res) {
        if (first) {
            /* terminate the progress line */
            fputc('\n', stderr);
        }
        if (*ecurve) {
            uproc_ecurve_destroy(*ecurve);
            *ecurve = NULL;
        }
    }
    fprintf(stdout,"<a\t");
    uproc_alphabet_destroy(alpha);
    fprintf(stdout,">\n");
    fprintf(stdout,"<free\t");
    free(entries);
    fprintf(stdout,">\n");
    return res;
}

/**
 * Build the forward or reverse ecurve and write it to
 * "<outdir>/fwd.ecurve" or "<outdir>/rev.ecurve".
 *
 * The ecurve is destroyed after the store attempt, whether or not storing
 * succeeded. Returns the first nonzero status encountered, 0 on success.
 */
static int
build_and_store(const char *infile, const char *outdir, const char *alphabet,
                uproc_idmap *idmap, unsigned int *xArray, unsigned int *yArray, unsigned int **TaxArray, bool reverse)
{
    const char *direction = reverse ? "rev" : "fwd";
    uproc_ecurve *built = NULL;
    int status;

    status = build_ecurve(infile, alphabet, idmap, xArray, yArray, TaxArray,
                          reverse, &built);
    if (status == 0) {
        fprintf(stderr, "Storing %s/%s.ecurve...", outdir, direction);
        status = uproc_ecurve_store(built, UPROC_ECURVE_BINARY, UPROC_IO_GZIP,
                                    "%s/%s.ecurve", outdir, direction);
        fprintf(stderr, " trying to destroy.");
        uproc_ecurve_destroy(built);
        fprintf(stderr, " Done.");
    }
    return status;
}

/**
 * Build and store the forward ecurve, then the reverse ecurve.
 *
 * The reverse ecurve is only attempted if the forward one succeeded.
 * Returns 0 on success, otherwise the status of the step that failed.
 */
int
build_ecurves(const char *infile,
              const char *outdir,
              const char *alphabet,
              uproc_idmap *idmap,
	      unsigned int *xArray,
	      unsigned int *yArray,
	      unsigned int **TaxArray
 	    )
{
    int status = build_and_store(infile, outdir, alphabet, idmap,
                                 xArray, yArray, TaxArray, false);

    if (status == 0) {
        fprintf(stdout,"reverse now...\n");
        status = build_and_store(infile, outdir, alphabet, idmap,
                                 xArray, yArray, TaxArray, true);
    }
    return status;
}
