+ updated filtering
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 17 Apr 2008 18:34:18 +0000 (18:34 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 17 Apr 2008 18:34:18 +0000 (18:34 +0000)
+ some minor changes in parsers and data processing

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8587 e1793c9e-67f9-0310-80fc-b846ff1f7b36

dyn_prog/Makefile
dyn_prog/test_fill_matrix.cpp
qpalma/DataProc.py
qpalma/parsers.py
scripts/PipelineHeuristic.py
tools/data_tools/Makefile
tools/data_tools/filterReads.c
tools/data_tools/parser.c
tools/data_tools/tools.c

index e8b348e..120d6a6 100644 (file)
@@ -22,7 +22,10 @@ OBJS = $(SRCS:%.cpp=%.o)
 #CXXFLAGS=-O3 -fPIC -pg -fprofile-arcs
 
 #CXXFLAGS=-O3 -fPIC -ggdb -pg
-CXXFLAGS=-O3 -fPIC -ggdb #-fno-inline
+#CXXFLAGS=-O3 -fPIC -ggdb -lprofiler #-fno-inline
+CXXFLAGS=-O3 -fPIC -g -ggdb 
+LDFLAGS=-lprofiler -L/fml/ag-raetsch/home/fabio/own_libs/libunwind/lib -lunwind-x86_64 -lunwind
+
 
 IF=QPalmaDP
 
@@ -33,7 +36,7 @@ all: $(OBJS) $(HDRS)
        @ python -c "import ${IF}"
 
 test: $(OBJS) $(HDRS) 
-       g++ $(CXXFLAGS) -o test_fm Mathmatics.o io.o penalty_info.o fill_matrix.o test_fill_matrix.cpp 
+       g++ $(CXXFLAGS) $(LDFLAGS) -o test_fm  debug_tools.o Mathmatics.o io.o penalty_info.o fill_matrix.o test_fill_matrix.cpp 
 
 clean:
        @ rm *.o *.so *.cxx ${IF}.py ${IF}.pyc
index 2e08e96..b6d47ae 100644 (file)
@@ -2,6 +2,8 @@
 #include <stdlib.h>
 #include <fstream>
 #include <sstream>
+#include <ctime>
+#include <sys/timeb.h>  
 using namespace std;
 
 #include "qpalma_dp.h"
@@ -158,8 +160,19 @@ int main(int argc, char** argv){
    int* max_score_positions = new int[nr_paths*2];
    mode currentMode = USE_QUALITY_SCORES;
 
+   //time_t startTime;
+   //time_t endTime;
+
    printf("calling fill_matrix\n");
-   fill_matrix(nr_paths, matrices, est_len, dna_len, est, dna, prb, &functions, matchmatrix, qualityScores, donor, acceptor, remove_duplicate_scores, nr_donor_sites, donor_sites, max_score_positions);
+
+   //time(&startTime);
+
+   for(idx=0;idx<1;idx++)
+      fill_matrix(nr_paths, matrices, est_len, dna_len, est, dna, prb, &functions, matchmatrix, qualityScores, donor, acceptor, remove_duplicate_scores, nr_donor_sites, donor_sites, max_score_positions);
+
+   //time(&endTime);
+
+   //printf ("Scan time: %6.3f\n", difftime(endTime,startTime));
 
    printf("End of test_fm \n");
 }
index 904d91c..047f2e7 100644 (file)
@@ -15,7 +15,7 @@ import sys
 from genome_utils import load_genomic
 from Genefinding import *
 
-def paths_load_data(filename,expt,genome_info,PAR,test=False):
+def paths_load_data(filename):
 
    #data = io_pickle.load(filename)
    data = cPickle.load(open(filename))
index f28892f..c22a75f 100644 (file)
@@ -41,7 +41,7 @@ class FilteredReadParser(ReadParser):
       read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop
 
       """
-      id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
+      id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()
       splitpos = int(splitpos)
       read_size = int(read_size)
 
@@ -65,10 +65,13 @@ class FilteredReadParser(ReadParser):
       exon_stop = int(exon_stop)
       exon_start = int(exon_start)
       p_stop = int(p_stop)
+      true_cut = int(true_cut)
 
       line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
       'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
-      'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start, 'p_stop':p_stop}
+      'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
+      'p_stop':p_stop,'true_cut':true_cut}
+
       return line_d
 
    def next(self):
@@ -211,3 +214,77 @@ class MapParser(ReadParser):
             entries[id] = old_entry
 
       return entries
+
+
+class PipelineReadParser(ReadParser):
+   """
+   This class offers a parser for the reads that are remapped by the vmatch
+   utility. 
+
+   According to the docu the entries are:
+   
+   ID, Chromosome, Position, Orientation (D or P), Mismatches, Alignment length, Offset, Alignment
+
+   """
+
+   def __init__(self,filename):
+      ReadParser.__init__(self,filename)
+
+   def parseLine(self,line):
+      """
+      We assume that a line has the following entries:
+
+      """
+
+      #id,chr,pos,strand,mismatches,length,offset,seq,prb,cal_prb,chastity,orig_seq,is_spliced = line.split()
+      id,chr,pos,strand,mismatches,length,offset,seq,prb,cal_prb,chastity = line.split()
+
+      chr   = int(chr)
+      pos   = int(pos)
+
+      if strand == 'D':
+         strand = '+'
+
+      if strand == 'P':
+         strand = '-'
+
+      mismatches  = int(mismatches)
+      length      = int(length)
+      offset      = int(offset)
+
+      seq=seq.lower()
+      #orig_seq=orig_seq.lower()
+
+      #if is_spliced == '1': 
+      #   is_spliced = True
+      #else:
+      #   is_spliced = False
+
+      line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand,\
+      'mismatches':mismatches, 'length':length, 'offset':offset,\
+      'seq':seq,'prb':prb,'cal_prb':cal_prb,'chastity':chastity}
+      #'orig_seq':orig_seq,'is_spliced':is_spliced}
+
+      return line_d
+
+   def next(self):
+      for line in self.fh:
+         line = line.strip()
+         yield self.parseLine(line)
+
+      raise StopIteration
+
+   def parse(self):
+      entries = {}
+
+      for elem in self.fh:
+         line_d = self.parseLine(elem)
+         id = line_d['id']
+         try:
+            entries[id] = [line_d]
+         except:
+            old_entry = entries[id]
+            old_entry.append(line_d)
+            entries[id] = old_entry
+
+      return entries
index f6ef2ae..10b8fbe 100644 (file)
@@ -63,6 +63,8 @@ class PipelineHeuristic:
       run = cPickle.load(open(run_fname))
       self.run = run
 
+      start = cpu()
+
       self.data_fname = data_fname
 
       self.param = cPickle.load(open(param_fname))
@@ -121,7 +123,19 @@ class PipelineHeuristic:
       # total time spend for get seq and scores
       self.get_time  = 0.0
       self.calcAlignmentScoreTime = 0.0
+      self.alternativeScoresTime = 0.0
+
+      self.count_time = 0.0
+      self.read_parsing = 0.0
+      self.main_loop = 0.0
+      self.splice_site_time = 0.0
+      self.computeSpliceAlignWithQualityTime = 0.0
+      self.computeSpliceWeightsTime = 0.0
+      self.DotProdTime = 0.0
+      self.array_stuff = 0.0
+      stop = cpu()
 
+      self.init_time = stop-start
 
    def filter(self):
       """
@@ -129,18 +143,24 @@ class PipelineHeuristic:
       """
       run = self.run 
 
+      start = cpu()
+
       rrp = PipelineReadParser(self.data_fname)
       all_remapped_reads = rrp.parse()
 
+      stop = cpu()
+
+      self.read_parsing = stop-start
 
       ctr = 0
       unspliced_ctr  = 0
       spliced_ctr    = 0
 
       print 'Starting filtering...'
+      _start = cpu()
 
       for readId,currentReadLocations in all_remapped_reads.items():
-         for location in currentReadLocations:
+         for location in currentReadLocations[:1]:
 
             id       = location['id']
             chr      = location['chr']
@@ -193,6 +213,7 @@ class PipelineHeuristic:
 
             alternativeAlignmentScores = self.calcAlternativeAlignments(location)
 
+            start = cpu()
             # found no alternatives
             if alternativeAlignmentScores == []:
                continue
@@ -228,8 +249,12 @@ class PipelineHeuristic:
                else:
                   self.true_pos += 1
 
-
             ctr += 1
+            stop = cpu()
+            self.count_time = stop-start
+
+      _stop = cpu()
+      self.main_loop = _stop-_start
 
       print 'Unspliced/Splice: %d %d'%(unspliced_ctr,spliced_ctr)
       print 'True pos / false pos : %d %d'%(self.true_pos,self.false_pos)
@@ -283,52 +308,92 @@ class PipelineHeuristic:
       genomicSeq_start  = pos
       genomicSeq_stop   = pos+self.intron_size*2+self.read_size*2
 
+      start = cpu()
       currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+      stop = cpu()
+      self.get_time += stop-start
+
       dna   = currentDNASeq
 
+      start = cpu()
       alt_don_pos,acc_pos = self.findHighestScoringSpliceSites(currentAcc,currentDon)
+      stop = cpu()
+      self.splice_site_time = stop-start
 
       alternativeScores = []
 
+      exons          = zeros((2,2),dtype=numpy.int)
+      est            = unb_seq
+      original_est   = seq
+      quality        = prb
+
+      # inlined
+      h = self.h
+      d = self.d
+      a = self.a
+      mmatrix = self.mmatrix
+      qualityPlifs = self.qualityPlifs
+      # inlined
+      
+      _start = cpu()
       for don_pos in alt_don_pos:
-         exons          = zeros((2,2),dtype=numpy.int)
+         start = cpu()
+
          exons[0,0]     = 0
          exons[0,1]     = don_pos
          exons[1,0]     = acc_pos+1
          exons[1,1]     = acc_pos+1+(self.read_size-don_pos)
 
-         est            = unb_seq
-         original_est   = seq
-         quality        = prb
-
          _dna = dna[:int(exons[1,1])]
          _dna = _dna[:exons[1,0]] + orig_seq[don_pos:]
 
-         #pdb.set_trace()
-
          _currentAcc = currentAcc[:int(exons[1,1])]
-
-         #acc_mean = mean([e for e in _currentAcc if e != -inf])
-         #factor = 8.5
          _currentAcc = [0.25]*len(_currentAcc) 
 
          _currentDon = currentDon[:int(exons[1,1])]
-
-         #don_mean = mean([e for e in _currentAcc if e != -inf])
-         #factor = 2.5
          _currentDon = [0.25]*len(_currentDon)
 
-         #pdb.set_trace()
-
          currentVMatchAlignment = _dna, exons, est, original_est, quality,\
          _currentAcc, _currentDon
 
+         stop = cpu()
+         self.array_stuff += stop - start
+
          #alternativeScore = self.calcAlignmentScore(currentVMatchAlignment)
+         #alternativeScores.append(self.calcAlignmentScore(currentVMatchAlignment))
+
+         # Lets start calculation
+         dna, exons, est, original_est, quality, acc_supp, don_supp =\
+         currentVMatchAlignment
+
+         # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
+         trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
+         computeSpliceAlignWithQuality(dna, exons, est, original_est,\
+         quality, qualityPlifs, run)
+
+         stop = cpu()
+         self.computeSpliceAlignWithQualityTime += stop-start
          start = cpu()
-         alternativeScores.append(self.calcAlignmentScore(currentVMatchAlignment))
+
+         # Calculate the weights
+         trueWeightDon, trueWeightAcc, trueWeightIntron =\
+         computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+
          stop = cpu()
-         self.calcAlignmentScoreTime += stop-start
+         self.computeSpliceWeightsTime += stop-start
+
+         start = cpu()
+
+         trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
 
+         # Calculate w'phi(x,y) the total score of the alignment
+         alternativeScores.append((trueWeight.T * self.currentPhi)[0,0])
+
+         stop = cpu()
+         self.DotProdTime += stop-start
+
+      _stop = cpu()
+      self.alternativeScoresTime += _stop-_start
 
       return alternativeScores
 
@@ -396,7 +461,17 @@ if __name__ == '__main__':
    print 'total time elapsed: %f' % (stop-start)
    print 'time spend for get seq: %f' % ph1.get_time
    print 'time spend for calcAlignmentScoreTime: %f' %  ph1.calcAlignmentScoreTime
-
+   print 'time spend for alternativeScoresTime: %f' % ph1.alternativeScoresTime
+   print 'time spend for count time: %f' % ph1.count_time
+   print 'time spend for init time: %f' % ph1.init_time
+   print 'time spend for read_parsing time: %f' % ph1.read_parsing
+   print 'time spend for main_loop time: %f' % ph1.main_loop
+   print 'time spend for splice_site_time time: %f' % ph1.splice_site_time
+
+   print 'time spend for computeSpliceAlignWithQualityTime time: %f'% ph1.computeSpliceAlignWithQualityTime
+   print 'time spend for computeSpliceWeightsTime time: %f'% ph1.computeSpliceWeightsTime
+   print 'time spend for DotProdTime time: %f'% ph1.DotProdTime
+   print 'time spend forarray_stuff time: %f'% ph1.array_stuff
    #import cProfile
    #cProfile.run('ph1.filter()')
 
index 5602972..9fbe627 100644 (file)
@@ -4,6 +4,7 @@ CFLAGS=-O3 -ggdb -Wall -Wshadow -lm
 SRCS= parser.c\
        sort_genes.c\
        tools.c\
+       join.c\
        debug_tools.c\
        datastructures.c
 
@@ -15,5 +16,8 @@ all: $(OBJS)
 test:
        @ ./filterReads subset.gff subset.reads output 1
 
+join: $(OBJS)
+       @ $(CC) $(CFLAGS) -o test_join $(OBJS) main_join.c
+
 clean:
        @ rm -rf *.o
index 2df9192..b9300d2 100644 (file)
@@ -1,9 +1,15 @@
-////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // The purpose of this program is to read a gff and a    
 // solexa reads file and create a data set used by QPalma. 
 //
 //
-////////////////////////////////////////////////////////////
+//
+// Notes:
+//
+// - Both the read indices and the gff gene indices are one-based
+//
+//
+////////////////////////////////////////////////////////////////////////////////
 
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <math.h>
 
 #include "debug_tools.h"
+#include "join.h"
 #include "datastructures.h"
 
 #define _FILE_OFFSET_BITS == 64
 
-typedef struct tuple {
-   int first;
-   int second;
-} Tuple;
+const char* line_format = "%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n";
 
+const int read_size = 36;
 
-int compare_gene_struct(struct gene* a, struct gene* b) {
-   return a->stop - b->start;
-}
+const int min_overlap = 1;
+const int max_overlap = 35;
+
+unsigned long read_nr = 1;
+unsigned long unfiltered_read_nr = 1;
 
 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
 void sort_genes(struct gene*** allGenes, int numGenes);
-void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
+void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs, FILE* unfiltered_out_fs);
 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
 int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size);
-void remove_ambiguities(char * old_seq, char* new_seq);
+int remove_ambiguities(char * old_seq, char* new_seq);
 Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range);
 
 static char *info = "Usage is:\n./filterReads gff reads output";
@@ -44,125 +51,12 @@ static char *info = "Usage is:\n./filterReads gff reads output";
  *
  */
 
-#define _FDEBUG_ 0
-
-#define DEBUG_READ 38603
-
-const char* line_format = "%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n";
-
-const int read_size = 36;
-
-const int min_overlap = 1;
-const int max_overlap = 35;
-
-unsigned long read_nr = 1;
+//#define _FDEBUG_ 0
+//#define DEBUG_READ 38603
 
 int combined_reads = 0;
 
-
-int main(int argc, char* argv[]) {
-
-   if(argc != 5) {
-      printf("%s\n",info);
-      exit(EXIT_FAILURE);
-   }
-
-   int status;
-   int filenameSize = 256;
-   char* gff_filename = malloc(sizeof(char)*filenameSize);
-   char* reads_filename = malloc(sizeof(char)*filenameSize);
-   char* output_filename = malloc(sizeof(char)*filenameSize);
-
-
-   strncpy(gff_filename,argv[1],filenameSize);
-   strncpy(reads_filename,argv[2],filenameSize);
-   strncpy(output_filename,argv[3],filenameSize);
-
-   FILE *gff_fs = fopen(gff_filename,"r");
-   FILE *reads_fs = fopen(reads_filename,"r");
-   FILE *out_fs = fopen(output_filename,"w");
-
-   read_nr = strtoul(argv[4],NULL,10);
-   read_nr++;
-
-   if(gff_fs == NULL) {
-      printf("Error: Could not open file: %s",gff_filename);
-      exit(EXIT_FAILURE);
-   }
-
-   if(reads_fs == NULL) {
-      printf("Error: Could not open file: %s",reads_filename);
-      exit(EXIT_FAILURE);
-   }
-
-   if(out_fs == NULL) {
-      printf("Error: Could not open file: %s",output_filename);
-      exit(EXIT_FAILURE);
-   }
-
-   struct gene** allGenes;
-   int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
-   status = fclose(gff_fs);
-   free(gff_filename);
-   if(status != 0)
-      printf("closing of gff filestream failed!\n");
-
-   printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
-
-   // check if allGenes is sorted. if not throw away those genes that do not
-   // occur in the sorted order
-   int g_idx;
-   struct gene* currentGene = 0;
-   int nulled_genes=0;
-   int old_gene_stop = -1;
-   for(g_idx=0;g_idx<numGenes;g_idx++) {
-      currentGene = allGenes[g_idx];
-
-      if (! (currentGene->start < currentGene->stop))
-         printf("Invalid positions for gene %s!\n",currentGene->id);
-
-      if (! (old_gene_stop <  currentGene->start ) ) {
-         old_gene_stop = currentGene->stop;
-         allGenes[g_idx] = 0;
-         nulled_genes++;
-         continue;
-      }
-      old_gene_stop = currentGene->stop;
-   }
-
-   printf("Found %d unordered genes.\n",nulled_genes);
-   int gidx, eidx;
-   int exon_cov = 0;
-   for(gidx=0;gidx<numGenes;gidx++) {
-      if (allGenes[gidx] == 0)
-         continue;
-
-      for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
-         exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
-   }}
-   printf("Exon coverage is %f\n",(double)exon_cov/30432563);
-
-   process_reads(reads_fs,&allGenes,numGenes,out_fs);
-
-   for(g_idx=0;g_idx<numGenes;g_idx++) {
-      if(allGenes[g_idx] != 0)
-         free_gene(allGenes[g_idx]);
-         free(allGenes[g_idx]);
-
-   }
-   free(allGenes);
-
-   status = fclose(reads_fs);
-   status = fclose(out_fs);
-   if(status != 0)
-      perror("fclose");
-      
-   free(reads_filename);
-   free(output_filename);
-   return 0;
-}
-
-void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
+void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs, FILE* unfiltered_out_fs) {
    int status;
 
    int buffer_size= 64;
@@ -244,6 +138,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    int up_idx, down_idx;
 
    int readCtr = 0;
+   int read_start, read_stop;
    // start of the parsing loop 
    while(1) {
       if (gene_idx == numGenes || strcmp(current_line,"") == 0)
@@ -254,6 +149,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
       if (readCtr != 0 && readCtr % 1000000 == 0)
          printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
 
+      // pos of the reads is one-based
       status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
       if (status < 12) {
          skippedLinesCounter++;
@@ -266,6 +162,10 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          goto next_read;
       }
 
+      // define read start and stop positions
+      read_start = pos;
+      read_stop  = pos + read_size-1;
+
       if( strand != 'D' )
          goto next_read;
 
@@ -273,7 +173,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
       FA(currentGene != 0);
 
-      if ( currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop) { // read is within gene borders
+      if ( currentGene->start <= read_start && read_stop <= currentGene->stop) { // read is within gene borders
 
          exon_label:
 
@@ -294,7 +194,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
          //printf("exon: %d %d %d\t pos: %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
 
-         if (cur_exon_start - prev_exon_stop < min_overlap || cur_exon_start < pos ) { // go to next exon
+         if ( (cur_exon_start - prev_exon_stop - 1) < min_overlap || cur_exon_start < read_start ) { // go to next exon
 
             if (uov != 0 && dov != 0)
                combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
@@ -315,30 +215,35 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
             goto exon_label;
          }
 
-         if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
+         if ( prev_exon_start <= read_start && read_stop <= prev_exon_stop ) { // read is inside previous exon
+
+            // output of unused i.e. unspliced reads
+            fprintf(unfiltered_out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
+            unfiltered_read_nr,chr,strand,seq,0,read_size,prb,cal_prb,chastity,gene_id,read_start,-1,-1,read_stop);
+            unfiltered_read_nr++;
+            
             exonicReadCtr++;
             goto next_read;
          }
 
-         if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
+         if ( read_stop < prev_exon_stop ) // go to next read
             goto next_read;
 
          // if this position is reached the read is somehow overlapping or
          // exactly on the exon boundary.
-         if ( (prev_exon_stop - pos) >= min_overlap && (prev_exon_stop - pos) <= max_overlap ) { // read overlaps with previous exon boundary
+         if ( (prev_exon_stop - read_start + 1) >= min_overlap && (prev_exon_stop - read_start + 1) <= max_overlap ) { // read overlaps with previous exon boundary
             //printf("%s\n",current_line);
             prevOverlapCtr++;
-            currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
+            currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
             upstream_overlap[uov] = currentRead;
             uov++;
             goto next_read;
          }
       
-         int end_pos = pos+read_size-1;
-         if ( (end_pos - cur_exon_start + 1) >= min_overlap && (end_pos - cur_exon_start + 1) <= max_overlap ) { // read overlaps with current exon boundary
+         if ( ( read_stop - cur_exon_start + 1) >= min_overlap && (read_stop - cur_exon_start + 1) <= max_overlap ) { // read overlaps with current exon boundary
             //printf("%s\n",current_line);
             currentOverlapCtr++;
-            currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
+            currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
             downstream_overlap[dov] = currentRead;
             dov++;
             goto next_read;
@@ -349,6 +254,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
       } else { // read is not within gene borders
 
+
          if (uov != 0 && dov != 0)
             combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
 
@@ -364,7 +270,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
          uov = dov = 0;
 
-         if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
+         if ( currentGene->stop < read_stop ) { // go to next gene
             gene_idx++;
             exon_idx = 1;
             currentGene = (*allGenes)[gene_idx];
@@ -376,7 +282,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
             continue;
          }
 
-         if ( pos < currentGene->start ) { // go to next read
+         if ( read_start < currentGene->start ) { // go to next read
             skippedReadCtr++;
             
             next_read:
@@ -429,10 +335,8 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    free(chastity);
 }
 
-void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx) {
-
-   //printf("up/down size is %d/%d\n",up_size,down_size);
 
+void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx) {
    int up_idx, down_idx, success;
 
    char* up_used_flag = calloc(up_size,sizeof(char));
@@ -463,7 +367,6 @@ void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, R
             up_used_flag[up_idx] = 1;
             down_used_flag[down_idx] = 1;
          }
-
       }
    }
 
@@ -471,10 +374,28 @@ void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, R
    free(down_used_flag);
 }
 
+/*
+ *
+ *
+ *
+ * ACGTACGTCA GTXXXXXXXXAG ACGTAGACGT
+ * p1       e1             e2       p2  
+ *
+ *
+ *
+ *
+ */
+
 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx) {
    // range of possible sequence length on exon side
-   int up_range   = exon_stop - up_read->pos;
-   int down_range = down_read->pos+(read_size-1) - exon_start+1;
+   int up_read_start = up_read->pos;
+   //int up_read_stop  = up_read->pos+read_size-1;
+
+   int down_read_start = down_read->pos;
+   int down_read_stop  = down_read->pos+read_size-1;
+
+   int up_range   = exon_stop - up_read_start + 1;
+   int down_range = down_read_stop - exon_start + 1;
    int retval;
 
    int u_size, d_size;
@@ -483,41 +404,37 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    if(up_range+down_range < read_size)
       return 0;
 
-   //if(up_range < down_range) {
    if (read_nr % 2 != 0) {
       d_size = down_range;
       u_size = read_size - d_size;
-   }
-
-   //if(up_range >= down_range) {
-   if (read_nr % 2 == 0) {
+   } else {
       u_size = up_range;
       d_size = read_size - u_size;
    }
 
-   const int p_start  = exon_stop - u_size;
+   if( u_size > up_range || d_size > down_range)
+      return 0;
+
+   const int p_start  = exon_stop  - u_size + 1;
    const int p_stop   = exon_start + d_size - 1;
 
-   const int u_off = p_start - up_read->pos;
-   const int d_off = exon_start - down_read->pos;
+   int u_off = p_start - up_read_start;
+   int d_off = exon_start - down_read_start;
+
+   FA(u_off >= 0);
+   FA(d_off >= 0);
+
+   FA( exon_stop - p_start + p_stop - exon_start + 2 == read_size);
+   FA( u_size + d_size == read_size );
 
-   FA(u_size != -1);
-   FA(d_size != -1);
-   FA(u_size + d_size == read_size);
+   // seems reasonable up to here
 
    int buf_size = 4*read_size;
-   char* new_up_seq     = malloc(sizeof(char)*buf_size);
-   char* new_down_seq   = malloc(sizeof(char)*buf_size); 
-   char* new_seq        = malloc(sizeof(char)*buf_size);
-
-   memset(new_up_seq,0,sizeof(char)*buf_size);
-   memset(new_down_seq,0,sizeof(char)*buf_size);
-   memset(new_seq,0,sizeof(char)*buf_size);
-   
-   strncpy(new_up_seq,up_read->seq,strlen(up_read->seq));
-   strncpy(new_down_seq,down_read->seq,strlen(down_read->seq));
-
-   Tuple jinfo = join_seq(new_seq,new_up_seq,u_off,u_size,new_down_seq,d_off,d_size,down_range);
+   char* new_seq = malloc(sizeof(char)*buf_size);
+   memset(new_seq,'z',sizeof(char)*buf_size);
+
+   Tuple jinfo = join_seq(new_seq,up_read->seq,u_off,u_size,down_read->seq,d_off,d_size,down_range);
+
    int cut_pos = jinfo.first;
    int additional_pos = jinfo.second;
 
@@ -527,6 +444,11 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    char* new_cal_prb    = malloc(sizeof(char)*buf_size);
    char* new_chastity   = malloc(sizeof(char)*buf_size);
 
+   if( jinfo.first == -1 ) {
+      retval = 0;
+      goto free;
+   }
+
    if( additional_pos > (down_range - d_size) ) {
       retval = 0;
       goto free;
@@ -544,32 +466,6 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size+additional_pos);
    new_chastity[read_size+additional_pos] = '\0';
 
-#ifdef _FDEBUG_
-   if(read_nr == DEBUG_READ) {
-      printf("read nr %lu",read_nr);
-      printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
-      printf("u/d range: %d %d\n",up_range,down_range);
-      printf("u/d size: %d %d\n",u_size,d_size);
-      printf("u/d off: %d %d\n",u_off,d_off);
-      printf("add_pos/down_range-d_size: %d %d\n",additional_pos,(down_range - d_size));
-
-      printf("******************\n");
-
-      printf("%s\n",up_read->seq);
-      printf("%s\n",new_up_seq);
-
-      printf("******************\n");
-
-      printf("%s\n",down_read->seq);
-      printf("%s\n",new_down_seq);
-
-      printf("******************\n");
-      printf("%s\n",new_seq);
-      printf("%s\n",new_prb);
-      printf("%s\n",new_cal_prb);
-      printf("%s\n",new_chastity);
-   }
-#endif // _FDEBUG_
 
    int status = 1; //fitting(up_read->prb,down_read->prb,u_off,d_off,u_size,d_size);
 
@@ -583,6 +479,9 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    //fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n",
    //   read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,up_read->seq,down_read->seq);
 
+   //fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
+   //   read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,u_size);
+
    fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
       read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,u_size);
 
@@ -590,9 +489,6 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    combined_reads++;
 
    free:
-   free(new_up_seq);
-   free(new_down_seq); 
-
    free(new_seq);
    free(new_prb);
    free(new_cal_prb);
@@ -602,192 +498,130 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
 }
 
 
-Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range) {
-   int new_idx, idx;
-   new_idx = idx = 0;
-   int elem_ctr = 0;
-   int read_del_ctr = 0;
-
-   int u_idx = 0;
-   int skipped_pos_ctr = 0;
-   while(1) {
-      if( u_idx >= read_size || skipped_pos_ctr == u_off) 
-         break;
-        
-      if(up_seq[u_idx] == '[') {
-         u_idx += 4;
-         skipped_pos_ctr++;
-      } else {
-         u_idx++;
-         skipped_pos_ctr++;
-      }
-   }
-
-   if(read_nr == DEBUG_READ) {
-      printf("u_off is %d\n",u_off);
-      printf("u_idx is %d\n",u_idx);
-   }
-   u_off = u_idx+1;
+void print_read(Read* cRead) {
+   printf(line_format,
+   cRead->chr, cRead->pos, cRead->seq, cRead->id,
+   cRead->strand, cRead->mismatch, cRead->occurrence,
+   cRead->size, cRead->cut, cRead->prb, cRead->cal_prb,
+   cRead->chastity);
+}
 
+/*
+ * The mainloop 
+ *
+ */
 
-   if( u_off > 0 && up_seq[u_off+idx-1] == '[' )
-      idx = -1;
+int main(int argc, char* argv[]) {
 
-   if( u_off > 1 && up_seq[u_off+idx-2] == '[' )
-      idx = -2;
+   if(argc != 7) {
+      printf("%s\n",info);
+      exit(EXIT_FAILURE);
+   }
 
-   if( u_off > 2 && up_seq[u_off+idx-3] == '[' )
-      idx = -3;
+   int status;
+   int filenameSize = 256;
+   char* gff_filename = malloc(sizeof(char)*filenameSize);
+   char* reads_filename = malloc(sizeof(char)*filenameSize);
+   char* output_filename = malloc(sizeof(char)*filenameSize);
+   char* unfiltered_output_filename = malloc(sizeof(char)*filenameSize);
 
-   while(1) {
-      if (elem_ctr == u_size)
-         break;
+   strncpy(gff_filename,argv[1],filenameSize);
+   strncpy(reads_filename,argv[2],filenameSize);
+   strncpy(output_filename,argv[3],filenameSize);
+   strncpy(unfiltered_output_filename,argv[4],filenameSize);
 
-      if(up_seq[u_off+idx] == '[') {
+   FILE *gff_fs = fopen(gff_filename,"r");
+   FILE *reads_fs = fopen(reads_filename,"r");
+   FILE *out_fs = fopen(output_filename,"w");
+   FILE *unfiltered_out_fs = fopen(unfiltered_output_filename,"w");
 
-         if(up_seq[u_off+idx+2] == '-') // del in read seq.
-            read_del_ctr++;
+   read_nr = strtoul(argv[5],NULL,10);
+   read_nr++;
 
-         while(1) { 
-            new_seq[new_idx] = up_seq[u_off+idx];
-            new_idx++;
-            idx++;
-            if(up_seq[u_off+idx-1] == ']')
-               break;
-         }
-         elem_ctr++;
-         continue;
-      }
+   unfiltered_read_nr = strtoul(argv[6],NULL,10);
+   unfiltered_read_nr++;
 
-      new_seq[new_idx] = up_seq[u_off+idx];
-      idx++;
-      new_idx++;
-      elem_ctr++;
+   if(gff_fs == NULL) {
+      printf("Error: Could not open file: %s",gff_filename);
+      exit(EXIT_FAILURE);
    }
 
-   elem_ctr = 0;
-   idx = 0;
-
-   Tuple tuple;
-   tuple.first = new_idx;
-
-   int d_idx = 0;
-   skipped_pos_ctr = 0;
-   while(1) {
-      if( d_idx >= read_size || skipped_pos_ctr == d_off) 
-         break;
-        
-      if(down_seq[d_idx] == '[') {
-         d_idx += 4;
-         skipped_pos_ctr++;
-      } else {
-         d_idx++;
-         skipped_pos_ctr++;
-      }
+   if(reads_fs == NULL) {
+      printf("Error: Could not open file: %s",reads_filename);
+      exit(EXIT_FAILURE);
    }
 
-   if(read_nr == DEBUG_READ) {
-      printf("d_off is %d\n",d_off);
-      printf("d_idx is %d\n",d_idx);
+   if(out_fs == NULL) {
+      printf("Error: Could not open file: %s",output_filename);
+      exit(EXIT_FAILURE);
    }
-   d_off = d_idx+1;
-   //if(read_nr == DEBUG_READ) {
-   //   printf("%s\n",new_seq);
-   //   printf("%c\n",down_seq[d_off+idx]);
-   //}
-
-   if( d_off > 0 && down_seq[d_off+idx-1] == '[' )
-      idx = -1;
 
-   if( d_off > 1 && down_seq[d_off+idx-2] == '[' )
-      idx = -2;
+   if(unfiltered_out_fs == NULL) {
+      printf("Error: Could not open file: %s",unfiltered_output_filename);
+      exit(EXIT_FAILURE);
+   }
 
-   if( d_off > 2 && down_seq[d_off+idx-3] == '[' )
-      idx = -3;
+   struct gene** allGenes;
+   int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
+   status = fclose(gff_fs);
+   free(gff_filename);
+   if(status != 0)
+      printf("closing of gff filestream failed!\n");
 
-   while(1) {
-      if (elem_ctr == d_size)
-         break;
+   printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
 
-      if(down_seq[d_off+idx] == '[') {
+   // check if allGenes is sorted. if not throw away those genes that do not
+   // occur in the sorted order
+   int g_idx;
+   struct gene* currentGene = 0;
+   int nulled_genes=0;
+   int old_gene_stop = -1;
+   for(g_idx=0;g_idx<numGenes;g_idx++) {
+      currentGene = allGenes[g_idx];
 
-         if(down_seq[d_off+idx+2] == '-') // del in read seq.
-            read_del_ctr++;
+      if (! (currentGene->start < currentGene->stop))
+         printf("Invalid positions for gene %s!\n",currentGene->id);
 
-         while(1) { 
-            new_seq[new_idx] = down_seq[d_off+idx];
-            new_idx++;
-            idx++;
-            if(down_seq[d_off+idx-1] == ']')
-               break;
-         }
-         elem_ctr++;
+      if (! (old_gene_stop <  currentGene->start ) ) {
+         old_gene_stop = currentGene->stop;
+         allGenes[g_idx] = 0;
+         nulled_genes++;
          continue;
       }
-
-      new_seq[new_idx] = down_seq[d_off+idx];
-      idx++;
-      new_idx++;
-      elem_ctr++;
-   }
-
-   elem_ctr = 0;
-
-#ifdef _FDEBUG_
-   if(read_nr == DEBUG_READ) {
-      printf("read_del_ctr %d\n",read_del_ctr);
-      printf("idx/new_idx: %d %d\n",idx,new_idx);
-      printf("next char: %c\n",down_seq[d_off+idx]);
-      printf("down_seq: %s\n",down_seq);
+      old_gene_stop = currentGene->stop;
    }
-#endif // _FDEBUG_
 
-   while(1) {
-      if (elem_ctr == read_del_ctr || d_off+idx >= strlen(down_seq))
-         break;
-
-      if(down_seq[d_off+idx] == '[') {
-
-         while(1) { 
-            new_seq[new_idx] = down_seq[d_off+idx];
-            new_idx++;
-            idx++;
-            if(down_seq[d_off+idx-1] == ']')
-               break;
-         }
-         elem_ctr++;
+   printf("Found %d unordered genes.\n",nulled_genes);
+   int gidx, eidx;
+   int exon_cov = 0;
+   for(gidx=0;gidx<numGenes;gidx++) {
+      if (allGenes[gidx] == 0)
          continue;
-      }
 
-#ifdef _FDEBUG_
-      if(read_nr == DEBUG_READ)
-         printf("current char: %c\n",down_seq[d_off+idx]);
-#endif // _FDEBUG_
-
-      new_seq[new_idx] = down_seq[d_off+idx];
-      new_idx++;
-      idx++;
-      elem_ctr++;
-   }
+      for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
+         exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
+   }}
+   printf("Exon coverage is %f\n",(double)exon_cov/30432563);
 
-   new_seq[new_idx+1] = '\0';
+   process_reads(reads_fs,&allGenes,numGenes,out_fs,unfiltered_out_fs);
 
-#ifdef _FDEBUG_
-   if(read_nr == DEBUG_READ)
-      printf("idx/new_idx: %d %d\n",idx,new_idx);
-#endif // _FDEBUG_
+   for(g_idx=0;g_idx<numGenes;g_idx++) {
+      if(allGenes[g_idx] != 0)
+         free_gene(allGenes[g_idx]);
+         free(allGenes[g_idx]);
 
-   tuple.second = read_del_ctr;
-   return tuple;
-}
+   }
+   free(allGenes);
 
-void print_read(Read* cRead) {
+   status = fclose(reads_fs);
+   status = fclose(out_fs);
+   if(status != 0)
+      perror("fclose");
+      
+   free(reads_filename);
+   free(output_filename);
 
-   printf(line_format,
-   cRead->chr, cRead->pos, cRead->seq, cRead->id,
-   cRead->strand, cRead->mismatch, cRead->occurrence,
-   cRead->size, cRead->cut, cRead->prb, cRead->cal_prb,
-   cRead->chastity);
+   return 0;
 }
 
 /*
@@ -796,3 +630,30 @@ void print_read(Read* cRead) {
  * - check for [AC] and similar entries -> done simple (see function
  * - remove_ambiguities (exchanges [XY] by the second entry)
  */
+
+#ifdef _FDEBUG_
+   if(read_nr == DEBUG_READ) {
+      printf("read nr %lu",read_nr);
+      printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
+      printf("u/d range: %d %d\n",up_range,down_range);
+      printf("u/d size: %d %d\n",u_size,d_size);
+      printf("u/d off: %d %d\n",u_off,d_off);
+      printf("add_pos/down_range-d_size: %d %d\n",additional_pos,(down_range - d_size));
+
+      printf("******************\n");
+
+      printf("%s\n",up_read->seq);
+      //printf("%s\n",new_up_seq);
+
+      printf("******************\n");
+
+      printf("%s\n",down_read->seq);
+      //printf("%s\n",new_down_seq);
+
+      printf("******************\n");
+      printf("%s\n",new_seq);
+      printf("%s\n",new_prb);
+      printf("%s\n",new_cal_prb);
+      printf("%s\n",new_chastity);
+   }
+#endif // _FDEBUG_
index 9a5b9f8..79eddf9 100644 (file)
@@ -121,7 +121,7 @@ int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
       }
 
       if (strcmp(id,"exon")==0) {
-         add_exon((*allGenes)[idx],start-1,stop);
+         add_exon((*allGenes)[idx],start,stop);
          //printf("exon start/stop: %d/%d\n",start-1,stop);
          continue;
       }
index f1e28cb..35cb015 100644 (file)
@@ -51,14 +51,14 @@ int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int
  * we want to remove/store these ambiguities in the filtered reads
  */
 
-void remove_ambiguities(char * old_seq, char* new_seq) {
-
-   FA(0);
+int remove_ambiguities(char * old_seq, char* new_seq) {
 
    int idx=0;
    int new_idx = 0;
    int old_seq_size = strlen(old_seq);
 
+   int est_deletions_ctr = 0;
+
    while(idx<old_seq_size) {
       //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
       if (old_seq[idx] == ']') {
@@ -71,6 +71,8 @@ void remove_ambiguities(char * old_seq, char* new_seq) {
          // we have a indel either on the dna or read sequence
          if (old_seq[idx+1] == '-' || old_seq[idx+2] == '-') {
             
+            if(old_seq[idx+2] == '-')
+               est_deletions_ctr += 1;
 
             while(1) { 
                new_seq[new_idx] = old_seq[idx];
@@ -93,4 +95,5 @@ void remove_ambiguities(char * old_seq, char* new_seq) {
 
    new_seq[new_idx] = '\0';
    //printf("new_seq is %d :: %s\n",new_idx,new_seq);
+   return est_deletions_ctr;
 }