+ new version of the filterReads program
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 7 Feb 2008 11:47:17 +0000 (11:47 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 7 Feb 2008 11:47:17 +0000 (11:47 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7701 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/data_tools/datastructures.c
tools/data_tools/datastructures.h
tools/data_tools/filterReads.c

index 2f6e6fd..fedb7aa 100644 (file)
@@ -3,6 +3,64 @@
 
 #include "datastructures.h"
 
+Read* read_alloc(int _size) {
+   Read* newRead = (Read*) malloc(sizeof(struct read));
+   newRead->chr         = 0;
+   newRead->pos         = 0;
+   newRead->seq         = 0;
+   newRead->id          = 0;
+   newRead->strand      = ' ';
+   newRead->mismatch    = 0;
+   newRead->occurrence  = 0;
+   newRead->size        = _size;
+   newRead->cut         = 0;
+   newRead->prb         = 0; 
+   newRead->cal_prb     = 0; 
+   newRead->chastity    = 0; 
+
+   return newRead;
+}
+
+void free_read(Read* oldRead) {
+   if(oldRead->seq != 0)
+      free(oldRead->seq);
+   if(oldRead->prb != 0)
+      free(oldRead->prb);
+   if(oldRead->cal_prb != 0)
+      free(oldRead->cal_prb);
+   if(oldRead->chastity != 0)
+      free(oldRead->chastity);
+}
+
+Read* create_read(int chr, int pos, char* seq, unsigned long id, char strand, int mismatch,
+int occurrence, int size, int cut, char* prb, char* cal_prb, char* chastity) {
+
+   Read* newRead = read_alloc(size);
+
+   newRead->chr         = chr;
+   newRead->pos         = pos;
+   newRead->seq         = malloc(sizeof(char)*(size+1));
+   newRead->seq         = strncpy(newRead->seq,seq,size);
+   newRead->seq[size] = '\0';
+   newRead->id          = id;
+   newRead->strand      = strand;
+   newRead->mismatch    = mismatch;
+   newRead->occurrence  = occurrence;
+   newRead->size        = size;
+   newRead->cut         = cut;
+   newRead->prb         = malloc(sizeof(char)*(size+1));
+   newRead->prb         = strncpy(newRead->prb,prb,size);
+   newRead->prb[size] = '\0';
+   newRead->cal_prb     = malloc(sizeof(char)*(size+1));
+   newRead->cal_prb     = strncpy(newRead->cal_prb,cal_prb,size);
+   newRead->cal_prb[size] = '\0';
+   newRead->chastity    = malloc(sizeof(char)*(size+1));
+   newRead->chastity    = strncpy(newRead->chastity,chastity,size);
+   newRead->chastity[size] = '\0';
+
+   return newRead;
+}
+
 struct gene* gene_alloc(void) {
    struct gene* newGene = (struct gene*) malloc(sizeof(struct gene));
    newGene->exon_starts = malloc(2*sizeof(int));
index fa04070..c166136 100644 (file)
@@ -1,6 +1,27 @@
 #ifndef __DATA_STRUCTURES_H__
 #define __DATA_STRUCTURES_H__
 
+typedef struct read {
+   int chr;
+   int pos;
+   char* seq;
+   unsigned long id;
+   char strand;
+   int mismatch;
+   int occurrence;
+   int size;
+   int cut;
+   char* prb;
+   char* cal_prb;
+   char* chastity;
+} Read;
+
+Read* read_alloc(int _size);
+void free_read(Read* oldRead);
+
+Read* create_read(int chr, int pos, char* seq, unsigned long id, char strand, int mismatch,
+int occurrence, int size, int cut, char* prb, char* cal_prb, char* chastity);
+
 struct gene {
    int start;
    int stop;
index 69fb2bb..bac1324 100644 (file)
@@ -28,7 +28,9 @@ void sort_genes(struct gene*** allGenes, int numGenes);
 
 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
 
-void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
+void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
+
+int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
 
 int fitting(char* up_prb, char* down_prb);
 
@@ -200,8 +202,8 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
    int SIZE = 500;
    // initialize boundary arrays
-   void** upstream_overlap  = malloc(sizeof(void*)*SIZE);
-   void** downstream_overlap= malloc(sizeof(void*)*SIZE);
+   Read** upstream_overlap  = malloc(sizeof(Read*)*SIZE);
+   Read** downstream_overlap= malloc(sizeof(Read*)*SIZE);
    int uov,dov;
    uov = dov = 0;
 
@@ -231,6 +233,9 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    int currentOverlapCtr = 0;
    int multioccurReadCtr = 0;
 
+   Read* currentRead;
+   int up_idx, down_idx;
+
    int readCtr = 0;
    // start of the parsing loop 
    while(1) {
@@ -262,6 +267,17 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
             if (uov != 0 && dov != 0)
                combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
+
+            for(up_idx=0;up_idx<uov;up_idx++) {
+               free_read(upstream_overlap[up_idx]);
+               free(upstream_overlap[up_idx]);
+            }
+
+            for(down_idx=0;down_idx<dov;down_idx++) {
+               free_read(downstream_overlap[down_idx]);
+               free(downstream_overlap[down_idx]);
+            }
+
             uov = dov = 0;
 
             gene_idx++;
@@ -314,6 +330,16 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
             if (uov != 0 && dov != 0)
                combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
+
+            for(up_idx=0;up_idx<uov;up_idx++) {
+               free_read(upstream_overlap[up_idx]);
+               free(upstream_overlap[up_idx]);
+            }
+
+            for(down_idx=0;down_idx<dov;down_idx++) {
+               free_read(downstream_overlap[down_idx]);
+               free(downstream_overlap[down_idx]);
+            }
             uov = dov = 0;
 
             exon_idx++;
@@ -333,7 +359,8 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          if ( (prev_exon_stop - pos) >= min_overlap && (prev_exon_stop - pos) < read_size) { // read overlaps with previous exon boundary
             //printf("%s\n",current_line);
             prevOverlapCtr++;
-            upstream_overlap[uov] = lineBeginPtr;
+            currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
+            upstream_overlap[uov] = currentRead;
             uov++;
             goto next_read;
          }
@@ -342,7 +369,8 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          if ( (end_pos - cur_exon_start) >= min_overlap && (end_pos - cur_exon_start) < read_size) { // read overlaps with current exon boundary
             //printf("%s\n",current_line);
             currentOverlapCtr++;
-            downstream_overlap[dov] = lineBeginPtr;
+            currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
+            downstream_overlap[dov] = currentRead;
             dov++;
             goto next_read;
          }
@@ -354,6 +382,18 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
    combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
 
+   for(up_idx=0;up_idx<uov;up_idx++) {
+      free_read(upstream_overlap[up_idx]);
+      free(upstream_overlap[up_idx]);
+   }
+
+   for(down_idx=0;down_idx<dov;down_idx++) {
+      free_read(downstream_overlap[down_idx]);
+      free(downstream_overlap[down_idx]);
+   }
+
+   uov = dov = 0;
+
    free(upstream_overlap);
    free(downstream_overlap);
 
@@ -376,207 +416,142 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    free(chastity);
 }
 
-void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs,const char* gene_id, int exon_idx) {
+void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx) {
 
    printf("up/down size is %d/%d\n",up_size,down_size);
 
-   int up_idx, down_idx, status;
-   char* upstream_line = malloc(sizeof(char)*512);
-   char* downstream_line = malloc(sizeof(char)*512);
-   int p_start, p_stop;
-
-   int buffer_size= 64;
-
-   int up_chr        = 0;
-   int up_pos        = 0;
-   char* up_seq      = malloc(sizeof(char)*buffer_size);
-   unsigned long up_id         = 0;
-   char up_strand    = ' ';
-   int up_mismatch   = 0;
-   int up_occurrence = 0;
-   int up_sz       = 0;
-   int up_cut        = 0;
-   char* up_prb      = malloc(sizeof(char)*buffer_size);
-   char* up_cal_prb  = malloc(sizeof(char)*buffer_size);
-   char* up_chastity = malloc(sizeof(char)*buffer_size);
-
-   int down_chr        = 0;
-   int down_pos        = 0;
-   char* down_seq      = malloc(sizeof(char)*buffer_size);
-   unsigned long down_id         = 0;
-   char down_strand    = ' ';
-   int down_mismatch   = 0;
-   int down_occurrence = 0;
-   int down_sz         = 0;
-   int down_cut        = 0;
-   char* down_prb      = malloc(sizeof(char)*buffer_size);
-   char* down_cal_prb  = malloc(sizeof(char)*buffer_size);
-   char* down_chastity = malloc(sizeof(char)*buffer_size);
-
-   int new_chr        = 0;
-   char* new_seq      = malloc(sizeof(char)*buffer_size);
-   char new_strand    = ' ';
-   char* new_prb        = malloc(sizeof(char)*buffer_size);
-   char* new_cal_prb    = malloc(sizeof(char)*buffer_size);
-   char* new_chastity   = malloc(sizeof(char)*buffer_size);
-   char* new_up_seq = malloc(sizeof(char)*read_size);
-   char* new_down_seq = malloc(sizeof(char)*read_size); 
-
-   int overlap;
-   int fit;
-
-   char* lineBeginPtr;
-   char* lineEndPtr;
+   int up_idx, down_idx, overlap, success;
 
    char* up_used_flag = calloc(up_size,sizeof(char));
    char* down_used_flag = calloc(down_size,sizeof(char));
 
+   Read* currentUpRead;
+   Read* currentDownRead;
+
    for(up_idx=0;up_idx<up_size;up_idx++) {
       if( up_used_flag[up_idx] == 1)
          continue;
 
-      lineBeginPtr = upstream[up_idx];
-      lineEndPtr = lineBeginPtr;
-      while (*(char*)lineEndPtr != '\n') lineEndPtr++;
-      lineEndPtr++;
-      strncpy(upstream_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
-      upstream_line[lineEndPtr-lineBeginPtr] = '\0';
-
-      status = sscanf(upstream_line,line_format,&up_chr,&up_pos,up_seq,&up_id,
-      &up_strand,&up_mismatch,&up_occurrence,&up_sz,&up_cut,up_prb,up_cal_prb,up_chastity);
-
-      remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
+      currentUpRead = upstream[up_idx];
 
-      overlap = exon_stop - up_pos;
+      overlap = exon_stop - currentUpRead->pos;
 
       for(down_idx=0;down_idx<down_size;down_idx++) {
          if( down_used_flag[down_idx] == 1)
             continue;
 
-         lineBeginPtr = downstream[down_idx];
-         lineEndPtr = lineBeginPtr;
-         while (*(char*)lineEndPtr != '\n') lineEndPtr++;
-         lineEndPtr++;
-         strncpy(downstream_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
-         downstream_line[lineEndPtr-lineBeginPtr] = '\0';
+         currentDownRead = downstream[down_idx];
 
-         status = sscanf(downstream_line,line_format,
-         &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
-         &down_cut,down_prb,down_cal_prb,down_chastity);
-
-         if(up_strand != down_strand)
+         if(currentUpRead->strand != currentDownRead->strand)
             continue;
 
-         remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
-         
-         printf(line_format,
-         up_chr,up_pos,up_seq,up_id,up_strand,up_mismatch,up_occurrence,up_sz,
-         up_cut,up_prb,up_cal_prb,up_chastity);
-
-         printf(line_format,
-         down_chr,down_pos,down_seq,down_id,down_strand,down_mismatch,down_occurrence,down_sz,
-         down_cut,down_prb,down_cal_prb,down_chastity);
-
-         //new_seq[0] = '\0';
-         //new_prb[0] = '\0';
-         //new_cal_prb[0] = '\0';
-         //new_chastity[0] = '\0';
-         //int splitpos = 0;
+         success = join_reads(exon_stop,exon_start,currentUpRead,currentDownRead,out_fs,gene_id,exon_idx);
          
-         //
-         //fit = fitting(up_prb+(36-overlap)-6,down_prb+(exon_start-down_pos)-6);
-         //if (fit == -1)
-         //   continue;
-
-         //if (! (fit  < overlap ))
-         //   continue;
-
-         //new_chr     = up_chr;
-         //new_strand  = up_strand;
-         //splitpos = overlap;
-
-         //p_start = up_pos;
-         //p_stop =  exon_start+(read_size-overlap);
-
-         //strncat(new_seq,new_up_seq,overlap);
-         //strncat(new_prb,up_prb,overlap);
-         //strncat(new_cal_prb,up_cal_prb,overlap);
-         //strncat(new_chastity,up_chastity,overlap);
-
-         ////strncat(new_seq,new_down_seq+fit,read_size-overlap);
-         ////strncat(new_prb,down_prb+fit,read_size-overlap);
-         ////strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
-         ////strncat(new_chastity,down_chastity+fit,read_size-overlap);
-         //
-         //int offset = exon_start-down_pos;
-         //strncat(new_seq,new_down_seq+offset,read_size-overlap);
-         //strncat(new_prb,down_prb+offset,read_size-overlap);
-         //strncat(new_cal_prb,down_cal_prb+offset,read_size-overlap);
-         //strncat(new_chastity,down_chastity+offset,read_size-overlap);
-
-         //// The four last integers specify the positions of 
-         //// p_start : the position in the dna where the first read starts
-         //// exons_stop  : the position in the dna where the first exons ends
-         //// exons_start : the position in the dna where the second exons starts
-         //// p_stop  : the position in the dna where the (truncated) second read ends
-         //fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
-         //read_nr,new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
-         //read_nr++;
-
-         //combined_reads++;
-         //used_flag[down_idx] = 1;
+         if(success) {
+            up_used_flag[up_idx] = 1;
+            down_used_flag[down_idx] = 1;
+         }
+
       }
    }
 
-   free(upstream_line);
-   free(downstream_line);
-
    free(up_used_flag);
    free(down_used_flag);
-  
-   free(new_up_seq);
-   free(new_down_seq); 
+}
 
-   free(up_seq);
-   free(up_prb);
-   free(up_cal_prb);
-   free(up_chastity);
+int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx) {
+   int buffer_size = 64;
+    
+   char* new_seq        = malloc(sizeof(char)*buffer_size);
+   char* new_prb        = malloc(sizeof(char)*buffer_size);
+   char* new_cal_prb    = malloc(sizeof(char)*buffer_size);
+   char* new_chastity   = malloc(sizeof(char)*buffer_size);
 
-   free(down_seq);
-   free(down_prb);
-   free(down_cal_prb);
-   free(down_chastity);
+   char* new_up_seq     = malloc(sizeof(char)*read_size);
+   char* new_down_seq   = malloc(sizeof(char)*read_size); 
 
-   free(new_seq);
-   free(new_prb);
-   free(new_cal_prb);
-   free(new_chastity);
-}
+   // range of possible sequence length on exon side
+   int up_range   = exon_stop - up_read->pos+1;
+   int down_range = down_read->pos+(read_size-1) - exon_start+1;
 
-/*
-int fitting(char* up_prb, char* down_prb) {
-   int w_size = 3;
+   int down_offset = exon_start - down_read->pos +1;
+   int u_off, d_off, u_size, d_size;
 
-   double current_mean_up = 0;
-   double current_mean_down = 0;
+   printf("possible range %d %d\n",up_range,down_range);
 
-   int uidx;
-   for(uidx=0;uidx<w_size;uidx++) {
-      current_mean_up += up_prb[uidx]-50;
-      current_mean_down += down_prb[uidx]-50;
+   if(up_range+down_range < read_size)
+      return 0;
+
+   int half_read = read_size / 2;
+
+   int p_start, p_stop, offset;
+
+   // both overlap more than a half of the new read
+   if(up_range >= half_read &&  down_range >= half_read) {
+      printf("half\n");
+
+      u_size = half_read;
+      d_size = half_read;
+
+   } else {
+
+      if(up_range < down_range) {
+         u_size = up_range;
+         d_size = down_range - u_size;
+      }
+
+      if(up_range > down_range) {
+         d_size = down_range;
+         u_size = up_range - d_size;
+      }
    }
 
-   current_mean_up /= w_size;
-   current_mean_down /= w_size;
+   p_start  = exon_stop - u_size + 1;
+   p_stop   = exon_start + d_size - 1;
+
+   u_off = p_start - up_read->pos ;
+   d_off = exon_start - down_read->pos;
+
+   printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
+   printf("u/d size: %d %d\n",u_size,d_size);
+   printf("off/size u: %d %d\n",u_off,d_off);
+
+   remove_ambiguities(up_read->seq,read_size,new_up_seq);
+   remove_ambiguities(down_read->seq,read_size,new_down_seq);
+
+   strncpy(new_seq,new_up_seq+u_off,u_size);
+   strncpy(new_seq+u_size,new_down_seq+d_off,d_size);
+   new_seq[read_size] = '\0';
+
+   strncpy(new_seq, up_read->prb+u_off, u_size);
+   strncpy(new_seq+u_size, down_read->prb+d_off, d_size);
+   new_seq[read_size] = '\0';
+
+   strncpy(new_seq, up_read->cal_prb+u_off, u_size);
+   strncpy(new_seq+u_size, down_read->cal_prb+d_off, d_size);
+   new_seq[read_size] = '\0';
+
+   strncpy(new_seq, up_read->chastity+u_off, u_size);
+   strncpy(new_seq+u_size, down_read->chastity+d_off, d_size);
+   new_seq[read_size] = '\0';
+
+   //// The four last integers specify the positions of 
+   //// p_start : the position in the dna where the first read starts
+   //// exons_stop  : the position in the dna where the first exons ends
+   //// exons_start : the position in the dna where the second exons starts
+   //// p_stop  : the position in the dna where the (truncated) second read ends
+   fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
+   read_nr,up_read->chr,up_read->strand,new_seq,half_read-1,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
+   read_nr++;
 
-   double min_val = fmin(current_mean_up,current_mean_down);
-   double max_val = fmax(current_mean_up,current_mean_down);
-   if ( (min_val / max_val) <= 0.33 )
-      return 1;
+   free(new_up_seq);
+   free(new_down_seq); 
 
-   return -1;
+   free(new_seq);
+   free(new_prb);
+   free(new_cal_prb);
+   free(new_chastity);
 }
-*/
 
 int fitting(char* up_prb, char* down_prb) {
    double epsilon_mean = 30.0;
@@ -682,6 +657,8 @@ void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
       new_idx++;
    }
 
+   new_seq[new_idx] = '\0';
+
    if (new_idx != 36) {
       printf("Error: Sequence is not of length 36!\n");
       printf("old seq: %s\n",old_seq);
@@ -690,6 +667,15 @@ void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
    }
 }
 
+void print_read(Read* cRead) {
+
+   printf(line_format,
+   cRead->chr, cRead->pos, cRead->seq, cRead->id,
+   cRead->strand, cRead->mismatch, cRead->occurrence,
+   cRead->size, cRead->cut, cRead->prb, cRead->cal_prb,
+   cRead->chastity);
+}
+
 /*
  * TODO:
  * - Check strand -> done simple (only if equal)