+ parser works now with mmap and sscanf
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 19 Dec 2007 16:26:28 +0000 (16:26 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 19 Dec 2007 16:26:28 +0000 (16:26 +0000)
TODO
+ perform some logical checks -> orientation...

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7178 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/data_tools/filterReads.c

index 6044142..9398454 100644 (file)
@@ -17,6 +17,8 @@ int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
 
 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes);
 
+void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size);
+
 static char *info = "Usage is:\n./filterReads gff reads";
 
 int main(int argc, char* argv[]) {
@@ -93,144 +95,225 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
       printf("mmapping failed!\n");
 
    void* linePtr = reads_area;
-
-   char *test_string = malloc(sizeof(char)*10);
-   strncpy(test_string,(char*)reads_area,10);
-   
-   status = munmap(reads_area,mmapAreaSize);
-   if(status != 0)
-      printf("munmap failed!\n");
-
-   void** upstream_end;
-   void** upstream_overlap;
-
-   void** downstream_start;
-   void** downstream_overlap;
-   
+   char* current_line = malloc(sizeof(char)*256);
+
+   int SIZE = 500;
+   // initialize boundary arrays
+   void** upstream_end      = malloc(sizeof(void*)*SIZE);
+   void** upstream_overlap  = malloc(sizeof(void*)*SIZE);
+   void** downstream_start  = malloc(sizeof(void*)*SIZE);
+   void** downstream_overlap= malloc(sizeof(void*)*SIZE);
    int ue,uo,ds,dov;
-      
-   int SIZE = 50;
+   ue = uo = ds = dov = 0;
 
    int skippedLinesCounter = 0;
 
-   printf("Entering loop...\n");
+   int prev_exon_stop = -1;
+   int cur_exon_start = -1;
 
    int gene_idx;
    for (gene_idx = 0; gene_idx < numGenes;) {
       struct gene* currentGene = (*allGenes)[gene_idx];
 
       int exon_idx;
-      for(exon_idx=1;exon_idx<currentGene->num_exons-1;exon_idx++) {
-         int prev_exon_stop = currentGene->exon_stops[exon_idx-1];
-         int cur_exon_start = currentGene->exon_starts[exon_idx];
+      for(exon_idx=1;exon_idx<currentGene->num_exons;exon_idx++) {
+
+         combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov);
+         combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds);
+
+         prev_exon_stop = currentGene->exon_stops[exon_idx-1];
+         cur_exon_start = currentGene->exon_starts[exon_idx];
 
          if (cur_exon_start - prev_exon_stop < 6)
             continue;
 
-         // initialize boundary arrays
-         upstream_end      = malloc(sizeof(void*)*SIZE);
-         upstream_overlap  = malloc(sizeof(void*)*SIZE);
-         downstream_start  = malloc(sizeof(void*)*SIZE);
-         downstream_overlap= malloc(sizeof(void*)*SIZE);
-         ue = 0;
-         uo = 0;
-         ds = 0;
-         dov = 0;
-
-         printf("Fetching reads line...\n");
+         ue = uo = ds = dov = 0;
 
          label:
 
-         status = sscanf((char*)linePtr,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+         current_line = strncpy(current_line,linePtr,256);
+         if (strcmp(current_line,"") == 0)
+            goto end;
+
+         status = sscanf(current_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
          &chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity);
          if (status < 12) {
+            //printf("skipped line is %s\n",current_line);
             skippedLinesCounter++;
-            continue;
          }
 
-         printf("read line!");
+         //printf("%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+         //chr,pos,seq,id,strand,mismatch,repetition,size,cut,prb,cal_prb,chastity);
+         //printf("read position: %d gene start/stop: %d-%d\n ",pos,currentGene->start,currentGene->stop);
 
          if (currentGene->start <= pos && pos+35 <= currentGene->stop) {
-            printf("gene boundaries: %d %d, pos: %d\n",currentGene->start,currentGene->stop,pos);
-
+            //printf("gene boundaries: %d %d, pos: %d linePtr %d\n",currentGene->start,currentGene->stop,pos,linePtr);
             // upstream ending
             if (pos+35 == prev_exon_stop) {
                upstream_end[ue] = linePtr;
                ue++;
-            }
+            } else {
 
-            // upstream overlapping
-            if (prev_exon_stop - pos <= 30) {
-               upstream_overlap[uo] = linePtr;
-               uo++;
+               // upstream overlapping
+               if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) {
+                  upstream_overlap[uo] = linePtr;
+                  uo++;
+               }
             }
 
             // downstream starting
             if (pos == cur_exon_start) {
                downstream_start[ds] = linePtr;
                ds++;
-            }
+            } else {
 
-            // downstream overlapping
-            if (cur_exon_start - pos >= 6) {
-               downstream_overlap[dov] = linePtr;
-               dov++;
-            }
-
-            while (*(char*)linePtr != '\n') {
-               printf("linePtr is %d",linePtr);
-               linePtr++;
+               // downstream overlapping
+               if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) {
+                  downstream_overlap[dov] = linePtr;
+                  dov++;
+               }
             }
 
+            while (*(char*)linePtr != '\n') linePtr++;
             linePtr++;
             goto label;
 
-            free(upstream_end);
-            free(upstream_overlap);
-            free(downstream_start);
-            free(downstream_overlap);
-         }
+         } else {
 
-         while (*(char*)linePtr != '\n') {
-            printf("linePtr is %d",linePtr);
+            if (currentGene->stop < pos) {
+               //printf("read is too far downstream\n");
+               break;
+            }
+
+            //printf("read is too far upstream\n");
+            while (*(char*)linePtr != '\n') linePtr++;
             linePtr++;
+            goto label;
          }
-
-         linePtr++;
-         goto label;
-         
       }
 
       gene_idx++;
    }
 
+   end:
+
+   free(upstream_end);
+   free(upstream_overlap);
+   free(downstream_start);
+   free(downstream_overlap);
+
    printf("skipped %d lines.\n",skippedLinesCounter);
 
+   status = munmap(reads_area,mmapAreaSize);
+   if(status != 0)
+      printf("munmap failed!\n");
+
    free(seq);
    free(prb);
    free(cal_prb);
    free(chastity);
 }
 
-//      int skippedLinesCounter = 0;
-//      while(1) {
-//         status = fscanf(reads_fs,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",&chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity);
-//         if(status == EOF)
-//            break;
-//
-//         if (status < 12) {
-//            skippedLinesCounter++;
-//            continue;
-//         }
-//
-//         label:
-//
-//         if (pos < currentGene->start)
-//            continue;
-//
-//         if (currentGene->stop < pos) {
-//            gene_idx++;
-//            currentGene = (*allGenes)[gene_idx];
-//            goto label;
-//         }
-//
+void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size) {
+   //printf("up_/down_size %d %d\n",up_size,down_size);
+
+   if (up_size == 0 || down_size == 0 || exon_stop == -1)
+      return;
+
+   int up_idx, down_idx, status;
+   char* upstream_line = malloc(sizeof(char)*256);
+   char* downstream_line = malloc(sizeof(char)*256);
+
+   int buffer_size= 64;
+
+   int up_chr        = 0;
+   int up_pos        = 0;
+   char* up_seq      = malloc(sizeof(char)*buffer_size);
+   int up_id         = 0;
+   char up_strand    = ' ';
+   int up_mismatch   = 0;
+   int up_repetition = 0;
+   int up_sz       = 0;
+   int up_cut        = 0;
+   char* up_prb      = malloc(sizeof(char)*buffer_size);
+   char* up_cal_prb  = malloc(sizeof(char)*buffer_size);
+   char* up_chastity = malloc(sizeof(char)*buffer_size);
+
+   int down_chr        = 0;
+   int down_pos        = 0;
+   char* down_seq      = malloc(sizeof(char)*buffer_size);
+   int down_id         = 0;
+   char down_strand    = ' ';
+   int down_mismatch   = 0;
+   int down_repetition = 0;
+   int down_sz         = 0;
+   int down_cut        = 0;
+   char* down_prb      = malloc(sizeof(char)*buffer_size);
+   char* down_cal_prb  = malloc(sizeof(char)*buffer_size);
+   char* down_chastity = malloc(sizeof(char)*buffer_size);
+
+   int new_chr        = 0;
+   int new_pos        = 0;
+   char* new_seq      = malloc(sizeof(char)*buffer_size);
+   int new_id         = 0;
+   char new_strand    = ' ';
+   char* new_prb        = malloc(sizeof(char)*buffer_size);
+   char* new_cal_prb    = malloc(sizeof(char)*buffer_size);
+   char* new_chastity   = malloc(sizeof(char)*buffer_size);
+
+   up_idx=0;
+   down_idx=0;
+   while(1) {
+      //printf("up_/down_idx %d %d\n",up_idx,down_idx);
+
+      if (up_idx == up_size || down_idx == down_size)
+         break;
+
+      strncpy(upstream_line,upstream[up_idx],256);
+      status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+      &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_repetition,&up_sz,
+      &up_cut,up_prb,up_cal_prb,up_chastity);
+      
+      strncpy(downstream_line,downstream[down_idx],256);
+      status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+      &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_repetition,&down_sz,
+      &down_cut,down_prb,down_cal_prb,down_chastity);
+
+      new_prb[0] = '\0';
+      new_cal_prb[0] = '\0';
+      new_chastity[0] = '\0';
+         
+      // is downstream overlapping
+      if (up_pos+35 == exon_stop) {
+         int overlap = exon_start - down_pos;
+         //printf("overlap is %d\n",overlap);
+         //printf("pos are: %d %d\n",up_pos,down_pos);
+
+         strncat(new_prb,up_prb+(36-overlap),overlap);
+         strncat(new_cal_prb,up_cal_prb+(36-overlap),overlap);
+         strncat(new_chastity,up_chastity+(36-overlap),overlap);
+
+         strncat(new_prb,down_prb+overlap,36-overlap);
+         strncat(new_cal_prb,down_cal_prb+overlap,36-overlap);
+         strncat(new_chastity,down_chastity+overlap,36-overlap);
+
+         //printf("up_prb: %s\n",up_prb);
+         //printf("down_prb: %s\n",down_prb);
+         //printf("new_prb: %s\n",new_prb);
+
+      // is upstream overlapping
+      } else {
+
+      }
+         
+      up_idx++;
+      down_idx++;
+   }
+}
+
+/*
+ * TODO
+ * - Check strand
+ * - check repetition
+ *
+ */