+ resolved an issue with the memory mapping in filterReads
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 6 Feb 2008 22:04:30 +0000 (22:04 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 6 Feb 2008 22:04:30 +0000 (22:04 +0000)
+ added more checks in filterReads

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7697 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/Configuration.py
qpalma/SIQP.py
qpalma/tools/parseGff.py
scripts/compile_dataset.py
scripts/qpalma_main.py
tools/data_tools/Makefile
tools/data_tools/filterReads.c
tools/data_tools/filterReads.py
tools/data_tools/parser.c

index 6cf2fe7..907872b 100644 (file)
@@ -18,7 +18,7 @@ fixedParamQ = cPickle.load(open('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/
 # The parameters for the QPalma algorithm
 #
 #
-C = 1
+C = 1000
 
 min_intron_len = 20
 max_intron_len = 2000
index e2dfa85..2f1f767 100644 (file)
@@ -73,10 +73,10 @@ class SIQP:
       numQPlifs = Configuration.numQualPlifs 
 
       #lengthGroupParam  = 5*1e-9
-      #spliceGroupParam  = 1e-8
+      #spliceGroupParam  = 1e-9
 
-      lengthGroupParam  = 5*1e-9
-      spliceGroupParam  = 1e-9
+      lengthGroupParam  = 1.0
+      spliceGroupParam  = 1.0
 
       self.P[0:lengthSP,0:lengthSP] *= lengthGroupParam
 
@@ -86,8 +86,10 @@ class SIQP:
 
       regC = self.numFeatures / (1.0*self.numExamples)
 
+      cfactor = 1.0
+
       #cfactor = 1e-8
-      cfactor = 5e-9
+      #cfactor = 5e-9
 
       for j in range(lengthSP-1):
          self.P[j,j+1]    = -cfactor
@@ -95,8 +97,7 @@ class SIQP:
          self.P[j,j]     += cfactor
          self.P[j+1,j+1] += cfactor
 
-      cfactor = 1e-9
-      #cfactor = 5e-9
+      #cfactor = 1e-9
 
       for j in range(lengthSP,lengthSP+donSP-1):
          cfactor = spliceGroupParam
@@ -148,7 +149,6 @@ class SIQP:
       #matchGroupParam   *= ratio2 
       #qualityGroupParam *= ratio2 
 
-
       beg = lengthSP+donSP+accSP
       end = lengthSP+donSP+accSP+mmatrixSP
       self.P[beg:end,beg:end]       *= matchGroupParam
@@ -156,7 +156,7 @@ class SIQP:
       beg = lengthSP+donSP+accSP+mmatrixSP
       self.P[beg:-self.numExamples,beg:-self.numExamples]   *= qualityGroupParam
 
-      self.P[0:-self.numExamples,0:-self.numExamples]       *= regC*0.1
+      self.P[0:-self.numExamples,0:-self.numExamples]       *= 1.0
 
 
    def createRegularizer(self):
index 8e63918..281a8b9 100644 (file)
@@ -14,6 +14,7 @@ def parse_gff(gff_filename):
    reader = csv.reader(gff_fid, delimiter='\t', quoting=csv.QUOTE_NONE)
 
    allGenes = {}
+   allGenesA = []
    currentGene = None
 
    rx = re.compile('ID=[^;]*;',re.DOTALL)
@@ -33,6 +34,7 @@ def parse_gff(gff_filename):
       if id == 'gene':
          if currentGene != None:
             allGenes[currentGene.id] = currentGene
+            allGenesA.append(currentGene)
          
          desc = rx.search(desc).group()[3:-1]
          #print desc
@@ -81,6 +83,7 @@ def parse_gff(gff_filename):
       elif id == 'pseudogene':
          if currentGene != None:
             allGenes[currentGene.id] = currentGene
+            allGenesA.append(currentGene)
             currentGene = None
       else:
          assert False, 'Error: Unknown identifier \'%s\'' % id
@@ -88,7 +91,7 @@ def parse_gff(gff_filename):
    if currentGene != None:
       allGenes[currentGene.id] = currentGene
 
-   return allGenes
+   return allGenes,allGenesA
 
 
 def createGffPickle(annotFile,pickleFile):
index c4f3b88..aec5fc5 100644 (file)
@@ -191,12 +191,15 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
 
    # if we perform testing we cut a much wider region as we want to detect
    # how good we perform on a region
+   import random
+   ADD_POS = random.randint(1,20)
+
    if test:
-      up_cut = up_cut-500
+      up_cut = up_cut-ADD_POS
       if up_cut < 0:
          up_cut = 0
 
-      down_cut = down_cut+500
+      down_cut = down_cut+ADD_POS
 
       if down_cut > len(currentSeq):
          down_cut = len(currentSeq)
index bff6258..66bbe82 100644 (file)
@@ -509,7 +509,7 @@ class QPalma:
       print '##### Prediction on the test set #####'
       self.predict(param_filename,beg,end)
    
-      pdb.set_trace()
+      #pdb.set_trace()
 
    def predict(self,param_filename,beg,end):
       self.logfh = open('_qpalma_predict.log','w+')
index 992bdbf..5ea3362 100644 (file)
@@ -1,3 +1,5 @@
+CC=gcc
+CFLAGS=-O3 -ggdb -Wall -Wshadow -lm
 
 SRCS= parser.c\
        sort_genes.c\
@@ -5,10 +7,8 @@ SRCS= parser.c\
 
 OBJS = $(SRCS:%.cpp=%.o)
 
-FLAGS=-O3 -Wall -Wshadow -lm
-
 all: $(OBJS)
-       gcc $(FLAGS) -o filterReads $(OBJS) filterReads.c
+       $(CC) $(CFLAGS) -o filterReads $(OBJS) filterReads.c
 
 clean:
        rm -rf *.o
index 5022c45..be348b7 100644 (file)
@@ -13,6 +13,7 @@
 #include <unistd.h>
 #include <math.h>
 
+#include "debug_tools.h"
 #include "datastructures.h"
 
 #define _FILE_OFFSET_BITS == 64
@@ -29,7 +30,6 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out
 
 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
 
-//int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
 int fitting(char* up_prb, char* down_prb);
 
 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq);
@@ -97,15 +97,34 @@ int main(int argc, char* argv[]) {
 
    printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
 
-   // some entries in the gff files are not in a sorted order
-   //printf("Sorting genes...\n");
-   //sort_genes(&allGenes,numGenes);
-   //qsort(allGenes,numGenes,sizeof(struct gene*),compare_gene_struct);
-   ///printf("Genes were sorted!\n");
-   
+   // check if allGenes is sorted. if not throw away those genes that do not
+   // occur in the sorted order
+   int g_idx;
+   struct gene* currentGene = 0;
+   int nulled_genes=0;
+   int old_gene_stop = -1;
+   for(g_idx=0;g_idx<numGenes;g_idx++) {
+      currentGene = allGenes[g_idx];
+
+      if (! (currentGene->start < currentGene->stop))
+         printf("Invalid positions for gene %s!\n",currentGene->id);
+
+      if (! (old_gene_stop <  currentGene->start ) ) {
+         old_gene_stop = currentGene->stop;
+         allGenes[g_idx] = 0;
+         nulled_genes++;
+         continue;
+      }
+      old_gene_stop = currentGene->stop;
+   }
+
+   printf("Found %d unordered genes.\n",nulled_genes);
    int gidx, eidx;
    int exon_cov = 0;
    for(gidx=0;gidx<numGenes;gidx++) {
+      if (allGenes[gidx] == 0)
+         continue;
+
       for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
          exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
    }}
@@ -150,7 +169,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
    int numReads = reads_filesize / 178.0;
 
-   void *reads_area = mmap (NULL,reads_filesize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
+   void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
    if (reads_area == MAP_FAILED) {
       perror("mmap");
       exit(EXIT_FAILURE);
@@ -158,17 +177,22 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    close(reads_fid);
    printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
 
-   void* linePtr = reads_area;
-   char* current_line = malloc(sizeof(char)*256);
+   char* lineBeginPtr = (char*) reads_area;
+   char* lineEndPtr = (char*) reads_area;
+   char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
+
+   while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
+   lineEndPtr++;
+
+   char* current_line = malloc(sizeof(char)*512);
+   memset(current_line,0,512);
 
    int SIZE = 500;
    // initialize boundary arrays
-   void** upstream_end      = malloc(sizeof(void*)*SIZE);
    void** upstream_overlap  = malloc(sizeof(void*)*SIZE);
-   void** downstream_start  = malloc(sizeof(void*)*SIZE);
    void** downstream_overlap= malloc(sizeof(void*)*SIZE);
-   int ue,uo,ds,dov;
-   ue = uo = ds = dov = 0;
+   int uov,dov;
+   uov = dov = 0;
 
    int skippedLinesCounter = 0;
 
@@ -176,14 +200,17 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    int prev_exon_stop = -1;
    int cur_exon_start = -1;
 
-   current_line = strncpy(current_line,linePtr,256);
+   unsigned long line_size = lineEndPtr - lineBeginPtr;
+   //printf("distance is %lu\n",line_size);
+   strncpy(current_line,lineBeginPtr,line_size);
+   current_line[line_size] = '\0';
+   //printf("%s test",current_line);
+
    int gene_idx = 0;
    int exon_idx = 1;
    struct gene* currentGene = (*allGenes)[gene_idx];
    char* gene_id = currentGene->id;
 
-   //char* disamb_seq = malloc(sizeof(char)*read_size);
-
    int skippedReadCtr = 0;
    int uselessReadCtr = 0;
    int exonicReadCtr = 0;
@@ -204,9 +231,6 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
       if (readCtr != 0 && readCtr % 1000000 == 0)
          printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
 
-      //if (gene_idx >= 1833)
-      //   printf("currentGene start/stop: %d/%d. Positions is %d\n",currentGene->start,currentGene->stop,pos);
-
       status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
       &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
       if (status < 12) {
@@ -215,20 +239,25 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
       }
 
       // if the read is occurring several times elsewhere then get rid of it 
-      //if(!(occurrence >= 1 && occurrence <= 25)) {
       if ( occurrence != 1 ) {
          multioccurReadCtr++;
          goto next_read;
       }
 
+      FA(currentGene != 0);
+
       if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
 
          if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
             gene_idx++;
             exon_idx = 1;
             currentGene = (*allGenes)[gene_idx];
+            while( currentGene == 0 && gene_idx < numGenes) {
+               currentGene = (*allGenes)[gene_idx];
+               gene_idx++;
+            }
             //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
-            ue = uo = ds = dov = 0;
+            uov = dov = 0;
             continue;
          }
 
@@ -236,11 +265,13 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
             skippedReadCtr++;
             
             next_read:
-
-            while (*(char*)linePtr != '\n') linePtr++;
-            linePtr++;
+            
+            lineBeginPtr = lineEndPtr;
+            while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
+            lineEndPtr++;
             readCtr += 1;
-            current_line = strncpy(current_line,linePtr,256);
+            current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
+            current_line[lineEndPtr-lineBeginPtr] = '\0';
             continue;
          }
 
@@ -252,6 +283,10 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
             gene_idx++;
             exon_idx = 1;
             currentGene = (*allGenes)[gene_idx];
+            while( currentGene == 0 && gene_idx < numGenes) {
+               currentGene = (*allGenes)[gene_idx];
+               gene_idx++;
+            }
             continue;
          }
 
@@ -259,28 +294,18 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          prev_exon_stop = currentGene->exon_stops[exon_idx-1];
          cur_exon_start = currentGene->exon_starts[exon_idx];
 
-         //printf("exon %d %d inton til %d pos %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
-            
          if (cur_exon_start - prev_exon_stop < min_overlap || cur_exon_start < pos ) { // go to next exon
-            if (ue != 0 && dov != 0)
-               combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id,exon_idx);
 
-            if (uo != 0 && ds != 0) 
-               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id,exon_idx);
+            if (uov != 0 && dov != 0)
+               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
 
             exon_idx++;
-            ue = uo = ds = dov = 0;
+            uov = dov = 0;
             goto exon_label;
          }
 
          if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
             exonicReadCtr++;
-
-   // Removed exonic reads
-
-            //remove_ambiguities(seq,strlen(seq),disamb_seq);
-            //fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",chr,strand,disamb_seq,read_size,prb,cal_prb,chastity);
-
             goto next_read;
          }
 
@@ -288,31 +313,20 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
             goto next_read;
 
          // if this position is reached the read is somehow overlapping or
-         // exactly on the exon boundary. now determine where exactly:
-         if (pos + (read_size-1) == prev_exon_stop) {  // read ends at previous exon end
-            endPrevCtr++;
-            upstream_end[ue] = linePtr;
-            ue++;
-            goto next_read;
-         }
-
-         if ( (prev_exon_stop - pos) >= min_overlap && (prev_exon_stop - pos) <= max_overlap) { // read overlaps with previous exon boundary
+         // exactly on the exon boundary.
+         if ( (prev_exon_stop - pos) >= min_overlap && (prev_exon_stop - pos) < read_size) { // read overlaps with previous exon boundary
+            //printf("%s\n",current_line);
             prevOverlapCtr++;
-            upstream_overlap[uo] = linePtr;
-            uo++;
+            upstream_overlap[uov] = lineBeginPtr;
+            uov++;
             goto next_read;
          }
-
-         if ( pos == cur_exon_start ) { // read starts at current exon start
-            currentStartCtr++;
-            downstream_start[ds] = linePtr;
-            ds++;
-            goto next_read;
-         }
-
-         if ( (cur_exon_start - pos) >= min_overlap && (cur_exon_start - pos) <= max_overlap) { // read overlaps with current exon boundary
+      
+         int end_pos = pos+read_size-1;
+         if ( (end_pos - cur_exon_start) >= min_overlap && (end_pos - cur_exon_start) < read_size) { // read overlaps with current exon boundary
+            //printf("%s\n",current_line);
             currentOverlapCtr++;
-            downstream_overlap[dov] = linePtr;
+            downstream_overlap[dov] = lineBeginPtr;
             dov++;
             goto next_read;
          }
@@ -322,12 +336,9 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
       }
    }
 
-   combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id,exon_idx);
-   combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id,exon_idx);
+   combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
 
-   free(upstream_end);
    free(upstream_overlap);
-   free(downstream_start);
    free(downstream_overlap);
 
    printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
@@ -350,8 +361,10 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 }
 
 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs,const char* gene_id, int exon_idx) {
+
    //printf("up/down size is %d/%d\n",up_size,down_size);
-   
+
+   /*
    int up_idx, down_idx, status;
    char* upstream_line = malloc(sizeof(char)*256);
    char* downstream_line = malloc(sizeof(char)*256);
@@ -494,6 +507,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    free(new_prb);
    free(new_cal_prb);
    free(new_chastity);
+   */
 }
 
 /*
index 71646c5..93fb93d 100644 (file)
@@ -7,8 +7,68 @@ import pdb
 import qpalma
 from qpalma.tools.parseGff import parse_gff
 
+class Read:
+
+   def __init__(self):
+      pass
+
+def createRead(line):
+   chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity = line.split()
+   newRead = Read()
+   newRead.chr = int(chr)
+   newRead.pos = int(pos)
+   newRead.seq = seq
+
+   if strand == 'D':
+      newRead.strand = '+'
+   if strand == 'P':
+      newRead.strand = '-'
+
+   newRead.mismatch = int(mismatch)
+   newRead.occurrence = int(occurrence)
+   newRead.size = int(size)
+
+   newRead.prb = prb
+   newRead.cal_prb = cal_prb
+   newRead.chastity = chastity
+
+   return newRead
+
+
 def filter(gff_file,reads,out):
-   allGenes = parse_gff(gff_file)
+   read_size = 36
+
+   d,allGenes = parse_gff(gff_file)
+   del d
+
+   print 'Found %d genes.' % len(allGenes)
+
+   old_gene_pos = -1
+   for g_idx in range(len(allGenes)):
+      currentGene = allGenes[g_idx]
+
+      if not old_gene_pos < currentGene.start:
+         allGenes[g_idx] = 'nil'
+
+      old_gene_pos = currentGene.stop
+   
+   allGenes = [e for e in allGenes if e != 'nil']
+
+   print '%d genes remained.' % len(allGenes)
+
+   read_fh = open(reads)
+
+   for currentGene in allGenes:
+      currentGene.overlappingReads = []
+      currentRead = createRead(read_fh.next())
+
+      while currentGene.start < currentRead.pos and currentRead.pos+read_size-1 < currentGene.stop:
+         currentGene.overlappingReads.append(currentRead)
+         currentRead = createRead(read_fh.next())
+
+   #for currentGene in allGenes:
+   #   for (exon_start,exons_stop) in currentGene.exons:
+
 
    out_fh = open(out,'w+')
 
index b19d9db..79fe130 100644 (file)
@@ -58,7 +58,6 @@ int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
       exit(EXIT_FAILURE);
    }
 
-
    // do one pass through the gff line to determine the number of
    // genes
    int numGenes = 0;