+ minor changes to filterReads
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 8 Feb 2008 16:40:22 +0000 (16:40 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 8 Feb 2008 16:40:22 +0000 (16:40 +0000)
+ added some assertions to computeSpliceAlign...
+ removed a bug in the feature calculation of the label

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7726 e1793c9e-67f9-0310-80fc-b846ff1f7b36

13 files changed:
dyn_prog/Makefile
dyn_prog/fill_matrix.cpp
dyn_prog/qpalma_dp.cpp
dyn_prog/qpalma_dp.h
qpalma/Configuration.py
qpalma/DataProc.py
qpalma/computeSpliceAlignWithQuality.py
qpalma/computeSpliceWeights.py
qpalma/tools/parseGff.py
scripts/compile_dataset.py
scripts/qpalma_main.py
tools/data_tools/filterReads.c
tools/plot_param.py

index 0ccad31..d1029ac 100644 (file)
@@ -24,10 +24,10 @@ CXXFLAGS=-O3 -fPIC -ggdb
 IF=QPalmaDP
 
 all: $(OBJS) $(HDRS)
-       swig -c++ -python ${IF}.i
-       g++ $(CXXFLAGS) -I/usr/include/python2.5 -c ${IF}_wrap.cxx -o ${IF}_wrap.o
-       g++ $(CXXFLAGS) -shared -lpython2.5 $(OBJS) ${IF}_wrap.o -o _${IF}.so
-       python -c "import ${IF}"
+       swig -c++ -python ${IF}.i
+       g++ $(CXXFLAGS) -I/usr/include/python2.5 -c ${IF}_wrap.cxx -o ${IF}_wrap.o
+       g++ $(CXXFLAGS) -shared -lpython2.5 $(OBJS) ${IF}_wrap.o -o _${IF}.so
+       python -c "import ${IF}"
 
 clean:
-       rm *.o *.so *.cxx ${IF}.py ${IF}.pyc
+       rm *.o *.so *.cxx ${IF}.py ${IF}.pyc
index 1425ade..b494a2e 100644 (file)
@@ -311,6 +311,7 @@ void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var:
              tempValue = prevValue + matchmatrix[mlen*dnaChar];   /* score(gap,DNA) */
       }
 
+     //printf("tempValue: %f\n",tempValue);
 
          if (isfinite(tempValue))
          {
@@ -337,6 +338,8 @@ void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var:
         tempValue = prevValue +(matchmatrix[mlen* dnaChar +estChar]);
      }
 
+     //printf("tempValue: %f\n",tempValue);
+
          if (isfinite(tempValue))
          {
            assert(num_elem<max_num_elem) ;
@@ -363,6 +366,8 @@ void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var:
             tempValue = prevValue +(matchmatrix[estChar]); /* score(EST,gap) */
      }
 
+     //printf("tempValue: %f\n",tempValue);
+
          if (isfinite(tempValue))
          {
            assert(num_elem<max_num_elem) ;
@@ -398,6 +403,9 @@ void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var:
                int dist = j -(splice_pos-1); //_gt...ag: dist from g to g
                assert(dist>max_len) ;
                tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue;
+
+       //printf("tempValue: %f\n",tempValue);
+
                if (isfinite(tempValue))
                 {
                  assert(num_elem<max_num_elem) ;
index e2bdb35..d52e0fe 100644 (file)
@@ -1,3 +1,4 @@
+
 #include "qpalma_dp.h"
 #include <cstring>
 
@@ -24,7 +25,7 @@ Alignment::Alignment(int numQPlifs, int numq, bool use_qscores) {
       est_align            = 0;
       mmatrix_param        = 0;
       alignmentscores      = 0;
-      qualityScoresAllPaths = 0;
+      qualityFeaturesAllPaths = 0;
       mlen = 6; // score matrix: length of 6 for "- A C G T N"
 
       numQualSuppPoints = numq;
@@ -62,6 +63,8 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
     // dnaest
    DNA_ARRAY = 0;
    EST_ARRAY = 0;
+
+  //printf("[qpalma_dp] read is: %s\n",est);
   
   //possible donor positions
   int nr_donor_sites = 0 ;
@@ -96,7 +99,6 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
    //      printf("%f ",pidx,p.penalties[pidx]);
    //   printf("\n");
    //}
-   //
 
    //int h_idx;
    //for(h_idx=0;h_idx<h.len;h_idx++)
@@ -142,7 +144,7 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
   memset(alignmentscores, -1, nr_paths*sizeof(double)); //fills alignmentscores with zeros
   //printf("after memset...\n");
   
-  qualityScoresAllPaths= new penalty_struct*[nr_paths];
+  qualityFeaturesAllPaths= new penalty_struct*[nr_paths];
 
   for (int z=0; z<nr_paths; z++) {
     result_length = 0 ;
@@ -151,37 +153,43 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
     int* e_align = est_align + (est_len-1)*z ;    //pointer
     int* mparam  = mmatrix_param + mmatrix_size*z; //pointer
 
-   qualityScoresAllPaths[z] = new penalty_struct[numPlifs];
+   qualityFeaturesAllPaths[z] = new penalty_struct[numPlifs];
 
    int qidx, pidx;
    for(qidx=0;qidx<numPlifs;qidx++) {
       penalty_struct p;
+      init_penalty_struct(p);
       p.len = numQualSuppPoints;
-      p.limits = (REAL*) calloc(p.len,sizeof(REAL));
+      p.limits = (double*) calloc(p.len,sizeof(double));
 
       for(pidx=0;pidx<p.len;pidx++)
-         p.limits[pidx] = qualityScores[qidx%numPlifs].limits[pidx];
+         p.limits[pidx] = qualityScores[qidx].limits[pidx];
 
-      p.penalties = (REAL*) calloc(p.len,sizeof(REAL));
-      qualityScoresAllPaths[z][qidx] = p;
+      p.penalties = (double*) calloc(p.len,sizeof(double));
+      qualityFeaturesAllPaths[z][qidx] = p;
    }
 
    //penalty_struct* qparam = qualityScoresAllPaths + (numPlifs*z);
 
     //printf("before call to result_align...\n");
-    bool no_more_path = result_align(matrices, z, est_len, dna_len, &result_length, est, dna, prb, chastity, s_align, e_align, mparam, alignmentscores, max_score_positions, qualityScoresAllPaths[z] , currentMode);
+    bool no_more_path = result_align(matrices, z, est_len, dna_len, &result_length, est, dna, prb, chastity, s_align, e_align, mparam, alignmentscores, max_score_positions, qualityFeaturesAllPaths[z] , currentMode);
     //printf("after call to result_align...\n");
 
    //printf("z is %d\n",z);
+   //int len;
    //for(qidx=0;qidx<numPlifs;qidx++) {
    //   penalty_struct p;
-   //   p = qualityScoresAllPaths[qidx+numPlifs*z];
-   //   printf("%d ",qidx);
+   //   p = qualityScoresAllPaths[z][qidx];
+   //   printf("%d: ",qidx);
    //   for(pidx=0;pidx<p.len;pidx++)
-   //      printf("%f ",pidx,p.penalties[pidx]);
+   //      printf("%f ",p.limits[pidx]);
+   //   printf("\n");
+
+   //   for(pidx=0;pidx<p.len;pidx++)
+   //      printf("%f ",p.penalties[pidx]);
    //   printf("\n");
    //}
-   //
+
    if(z==0) {
 
    if(DNA_ARRAY != 0) {
@@ -260,25 +268,20 @@ void Alignment::getAlignmentResults(int* s_align, int* e_align,
       for (int z=0; z<nr_paths; z++) {
          for(int estChar=1;estChar<6;estChar++) {
             for(int dnaChar=0;dnaChar<6;dnaChar++) {
+
                int currentPos = (estChar-1)*6+dnaChar;
-               currentPlif = qualityScoresAllPaths[z][currentPos];
+               currentPlif = qualityFeaturesAllPaths[z][currentPos];
+
                for(int pidx=0; pidx<currentPlif.len; pidx++) {
                   qScores[ctr] = currentPlif.penalties[pidx];
+                  //printf("%f ",qScores[ctr]);
                   ctr++;
                }
+               //printf("\n");
       }}}
-   }
 
-   //for(idx=0; idx<numPathsPlifs; idx++) {
-   //   currentPlif = qualityScoresAllPaths[idx];
-   //   //printf("Size is %dx%d\n",numPathsPlifs,currentPlif.len);
-   //   for(pidx=0; pidx<currentPlif.len; pidx++) {
-   //      qScores[ctr] = currentPlif.penalties[pidx];
-   //      ctr++;
-   //   }
-   //}
-
-   //printf("\nctr is %d\n",ctr);
+      //printf("\nctr is %d\n",ctr);
+   }
    //printf("Leaving getAlignmentResults...\n");
 }
 
index 69a0ef5..34a0138 100644 (file)
@@ -71,7 +71,7 @@ class Alignment {
       int* est_align;
       int* mmatrix_param;
       double* alignmentscores;
-      struct penalty_struct** qualityScoresAllPaths;
+      struct penalty_struct** qualityFeaturesAllPaths;
 
       int dna_len;
       int est_len;
@@ -118,8 +118,8 @@ class Alignment {
          if(alignmentscores != 0)
             delete[] alignmentscores;
 
-         if(qualityScoresAllPaths != 0)
-            delete[] qualityScoresAllPaths;
+         if(qualityFeaturesAllPaths != 0)
+            delete[] qualityFeaturesAllPaths;
       }
 
       void myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
index 907872b..43c39a1 100644 (file)
@@ -18,7 +18,7 @@ fixedParamQ = cPickle.load(open('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/
 # The parameters for the QPalma algorithm
 #
 #
-C = 1000
+C = 100
 
 min_intron_len = 20
 max_intron_len = 2000
@@ -26,7 +26,7 @@ max_intron_len = 2000
 min_svm_score = 0.0 
 max_svm_score = 1.0
 
-numConstraintsPerRound = 100
+numConstraintsPerRound = 1
 
 ###############################################################################
 # 
@@ -79,10 +79,15 @@ mode = 'using_quality_scores'
 #numAccSuppPoints     = 30
 #numQualSuppPoints    = 16
 
-numLengthSuppPoints  = 20 
-numDonSuppPoints     = 20
-numAccSuppPoints     = 20
-numQualSuppPoints    =  8
+#numLengthSuppPoints  = 20 
+#numDonSuppPoints     = 20
+#numAccSuppPoints     = 20
+#numQualSuppPoints    =  8
+
+numLengthSuppPoints  = 2
+numDonSuppPoints     = 2
+numAccSuppPoints     = 2
+numQualSuppPoints    = 2
 
 min_qual = -1
 max_qual = 40
@@ -122,7 +127,7 @@ print_matrix            = False
 anzpath                 = 2
 
 if mode == 'normal':
-   fixedParam = fixedParam[:numFeatures]
+   fixedParam = fixedParamQ[:numFeatures]
 elif mode == 'using_quality_scores':
    fixedParam = fixedParamQ[:numFeatures]
 else:
@@ -137,10 +142,10 @@ else:
 ###############################################################################
 
 training_begin    =    0
-training_end      = 2000
+training_end      =    2
 
-prediction_begin  = 2000
-prediction_end    = 3500
+prediction_begin  =   10
+prediction_end    =   20
 
 joinp = os.path.join
 
@@ -153,7 +158,7 @@ remapped_path  = joinp(data_path,'remapped_solexa_data')
 
 dna_flat_fn    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
 gff_fn         = joinp(annot_path,'TAIR7_GFF3_genes_Chr%s.gff_v1')
-filtered_fn    = joinp(data_path,'allFilteredReads_04_02_2008')
+filtered_fn    = joinp(data_path,'allFilteredReads_07_02_2008')
 remapped_fn    = joinp(remapped_path,'map_best_hit.18.unambig')
 
 dataset_fn = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/chr1_dataset.pickle'
index 8db88d3..822ab82 100644 (file)
@@ -15,7 +15,11 @@ from genome_utils import load_genomic
 
 def paths_load_data(filename,expt,genome_info,PAR,test=False):
 
-   data = io_pickle.load(filename)
+   #data = io_pickle.load(filename)
+
+   #cPickle.dump(data,open(filename+'.alt','w+'))
+
+   data = cPickle.load(open(filename+'.alt'))
 
    Sequences   = data['Sequences']
    Acceptors   = data['Acceptors']
@@ -27,6 +31,7 @@ def paths_load_data(filename,expt,genome_info,PAR,test=False):
 
    return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions
 
+
 def paths_load_data_solexa(expt,genome_info,PAR,test=False):
    """
 
index bb72d92..454a613 100644 (file)
@@ -5,9 +5,10 @@ import numpy
 from numpy.matlib import mat,zeros,ones,inf
 import pdb
 from Plif import Plf,base_coord,linspace
-import Configuration
+import Configuration as Conf
 
-def  computeSpliceAlignWithQuality(dna, exons):
+def  computeSpliceAlignWithQuality(dna, exons, est=None, quality=None,
+qualityPlifs=None):
    """
    Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
    Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
@@ -58,14 +59,14 @@ def  computeSpliceAlignWithQuality(dna, exons):
 
    assert totalESTLength == len(est) 
 
-   sizeMatchmatrix = Configuration.sizeMatchmatrix
+   sizeMatchmatrix = Conf.sizeMatchmatrix
    # counts the occurences of a,c,g,t,n in this order
    numChar = [0]*sizeMatchmatrix[0]
 
    # counts the occurrences of a,c,g,t,n with their quality scores
-   #trueWeightQuality = zeros((Configuration.numQualPlifs*Configuration.numQualSuppPoints,1))
-   trueWeightQuality = numpy.zeros(Configuration.numQualPlifs*Configuration.numQualSuppPoints)
-   trueWeightQuality = trueWeightQuality.reshape(Configuration.estPlifs,Configuration.dnaPlifs,Configuration.numQualSuppPoints)
+   trueWeightQualityMat = [0.0]*5 # 5 rows
+   for row in range(5):
+      trueWeightQualityMat[row] = [0.0]*6 # 6 col per row
 
    for elem in est:
       if elem == 'a':
@@ -79,13 +80,55 @@ def  computeSpliceAlignWithQuality(dna, exons):
       if elem == 'n':
          numChar[4] += 1
 
-   if Configuration.mode == 'using_quality_scores':
-      for pos,elem in enumerate(numChar):
-         if pos < Configuration.estPlifs:
-            trueWeightQuality[pos][pos+1][-1] = elem
+   for row in range(5):
+      for col in range(6):
+         trueWeightQualityMat[row][col] = [0.0]*Conf.numQualSuppPoints # supp. points per plif
 
-   trueWeightQuality = mat(trueWeightQuality.flatten())
-   trueWeightQuality = trueWeightQuality.T
+   first_exon  = dna[exons[0,0]:exons[0,1]]
+   second_exon = dna[exons[1,0]:exons[1,1]]
+   dna_part = first_exon + second_exon
+
+   assert len(dna_part) == len(est)
+
+   map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5}
+
+   if Conf.mode == 'using_quality_scores':
+
+      for idx in range(len(dna_part)):
+         dnanum = map[dna_part[idx]]
+         estnum = map[est[idx]]
+         value = quality[idx]
+
+         cur_plf = qualityPlifs[(estnum-1)*6+dnanum]
+
+         Lower = len([elem for elem in cur_plf.limits if elem <= value])
+
+         if Lower == 0:
+            trueWeightQualityMat[estnum-1][dnanum][0] += 1
+         elif Lower == len(cur_plf.limits):
+            trueWeightQualityMat[estnum-1][dnanum][-1] += 1
+         else:
+            # because we count from 0 in python
+            Lower -= 1
+            Upper = Lower+1 ; # x-werte bleiben fest
+            #print value,Lower,Upper
+            weightup  = 1.0*(value - cur_plf.limits[Lower]) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
+            weightlow = 1.0*(cur_plf.limits[Upper] - value) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
+
+            trueWeightQualityMat[estnum-1][dnanum][Upper] += weightup
+            trueWeightQualityMat[estnum-1][dnanum][Lower] += weightlow
+
+      trueWeightQuality = zeros((Conf.numQualSuppPoints*Conf.numQualPlifs,1))
+
+      ctr = 0
+      for row in range(5):
+         for col in range(6):
+            for p_idx in range(Conf.numQualSuppPoints):
+               #print ctr, row, col, p_idx
+               trueWeightQuality[ctr] = trueWeightQualityMat[row][col][p_idx]
+               ctr += 1
+
+   assert int(sum(trueWeightQuality.flatten().tolist()[0])) == len(est)
 
    totalNumChar = 0
    for idx in range(sizeMatchmatrix[0]):
@@ -95,12 +138,12 @@ def  computeSpliceAlignWithQuality(dna, exons):
 
    # writing in weight match matrix
    # matrix is saved columnwise
-   if Configuration.mode == 'normal':
+   if Conf.mode == 'normal':
       trueWeightMatch = zeros((sizeMatchmatrix[0],sizeMatchmatrix[1])) # Scorematrix fuer Wahrheit
       for idx in range(1,sizeMatchmatrix[0]):
          trueWeightMatch[idx,idx] = numChar[idx-1]
 
-   if Configuration.mode == 'using_quality_scores':
+   if Conf.mode == 'using_quality_scores':
       trueWeightMatch = zeros((sizeMatchmatrix[0],sizeMatchmatrix[1])) # Scorematrix fuer Wahrheit
       #for idx in range(1,sizeMatchmatrix[0]):
       #   trueWeightMatch[idx] = numChar[idx-1]
index 31ff877..51cfc7f 100644 (file)
@@ -53,10 +53,8 @@ def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp):
 
    # Picke die Positionen raus, an denen eine Donorstelle ist
    try:
-      #if dec:
       DonorScores = [elem for pos,elem in enumerate(don_supp) if SpliceAlign[pos] == 1]
-      #else:
-      #   DonorScores = [elem for pos,elem in enumerate(don_supp) if pos > 0 and SpliceAlign[pos-1] == 1]
+
       assert not ( -inf in DonorScores )
    except:
       print 'Error'
index 281a8b9..bc7c976 100644 (file)
@@ -91,7 +91,7 @@ def parse_gff(gff_filename):
    if currentGene != None:
       allGenes[currentGene.id] = currentGene
 
-   return allGenes,allGenesA
+   return allGenes,allGenesA
 
 
 def createGffPickle(annotFile,pickleFile):
index aec5fc5..3a9087d 100644 (file)
@@ -98,6 +98,8 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    
    # Iterate over all remapped reads in order to generate for each read an
    # training / prediction example
+   instance_counter = 0
+
    for id,currentFRead in all_filtered_reads.items():
 
       try:
@@ -105,9 +107,12 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
       except:
          currentRReads = None
 
-      gene_id = currentFRead['gene_id']
-      chr = currentFRead['chr']
-      currentGene = allGenes[chr][gene_id]
+      try:
+         gene_id = currentFRead['gene_id']
+         chro = currentFRead['chr']
+         currentGene = allGenes[chro][gene_id]
+      except:
+         pdb.set_trace()
 
       if currentFRead['strand'] != '+':
          continue
@@ -126,6 +131,14 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
       SplitPositions.append(spos)
       Ids.append(id)
 
+      instance_counter += 1
+
+      if instance_counter % 100 == 0:
+         print 'processed %d examples' % instance_counter
+
+      if instance_counter == 1000:
+         break
+
    dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
    'Exons':Exons, 'Ests':Ests, 'Qualities':Qualities,\
    'SplitPositions':SplitPositions,'Ids':Ids}
@@ -171,7 +184,7 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    spos = fRead['splitpos']
 
    assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
-   assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
+   assert (exon_stop - p_start) + (p_stop - exon_start) + 2 == 36, 'Invalid read size'
    assert p_stop - p_start >= 36
 
    currentExons = zeros((2,2),dtype=numpy.int)
@@ -301,6 +314,8 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
 
    acc = [-inf] + acc[:-1]
 
+   currentExons[1,1] += 1
+
    return currentSeq, acc, don, currentExons, currentReadSeq, quality, spos
 
 
index 66bbe82..27aac5a 100644 (file)
@@ -59,74 +59,27 @@ class QPalma:
       #compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,False)
       #compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,True)
 
-      #gen_file= '%s/genome.config' % self.ARGS.basedir
-      #ginfo_filename = 'genome_info.pickle'
-      #self.genome_info = fetch_genome_info(ginfo_filename)
-      #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
-      #self.ARGS.train_with_splicesitescoreinformation = False
+      data_filename = Conf.dataset_fn
+      Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data(data_filename,'training',None,self.ARGS)
 
       # Load the whole dataset 
       if Conf.mode == 'normal':
-         #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
-         Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
          self.use_quality_scores = False
 
       elif Conf.mode == 'using_quality_scores':
-
-         #Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
-         data_filename = Conf.dataset_fn
-
-         Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data(data_filename,'training',None,self.ARGS)
-
-         #end = 1000
-         self.Sequences   = Sequences
-         self.Exons       = Exons
-         self.Ests        = Ests
-         self.Qualities   = Qualities
-         self.SplitPos    = SplitPos
-         self.Donors      = Donors
-         self.Acceptors   = Acceptors
-
-         min_score = 100
-         max_score = -100
-
-         for elem in self.Acceptors:
-            for score in elem:
-               if score != -inf:
-                  min_score = min(min_score,score)
-                  max_score = max(max_score,score)
-
-         print 'Acceptors max/min: %f / %f' % (max_score,min_score)
-
-         min_score = 100
-         max_score = -100
-
-         for elem in self.Donors:
-            for score in elem:
-               if score != -inf:
-                  min_score = min(min_score,score)
-                  max_score = max(max_score,score)
-
-         print 'Donor max/min: %f / %f' % (max_score,min_score)
-
-         min_score = 10000
-         max_score = 0
-         mean_intron_len = 0
-
-         for elem in self.Exons:
-            intron_len = int(elem[1,0] - elem[0,1])
-            mean_intron_len += intron_len
-            min_score = min(min_score,intron_len)
-            max_score = max(max_score,intron_len)
-
-         mean_intron_len /= 1.0*len(self.Exons)
-         print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
-
-
          self.use_quality_scores = True
       else:
          assert(False)
 
+      self.Sequences   = Sequences
+      self.Exons       = Exons
+      self.Ests        = Ests
+      self.Qualities   = Qualities
+      self.SplitPos    = SplitPos
+      self.Donors      = Donors
+      self.Acceptors   = Acceptors
+
+      calc_info(self.Acceptors,self.Donors,self.Exons)
       print 'leaving constructor...'
 
    def plog(self,string):
@@ -164,11 +117,6 @@ class QPalma:
       # Initialize parameter vector  / param = numpy.matlib.rand(126,1)
       param = Conf.fixedParam 
 
-      # Set the parameters such as limits penalties for the Plifs
-      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
-      #pdb.set_trace()
-
       lengthSP    = Conf.numLengthSuppPoints
       donSP       = Conf.numDonSuppPoints
       accSP       = Conf.numAccSuppPoints
@@ -177,18 +125,29 @@ class QPalma:
       numq        = Conf.numQualSuppPoints
       totalQualSP = Conf.totalQualSuppPoints
 
+      i1 = range(0,lengthSP)
+      i2 = range(lengthSP,lengthSP+donSP)
+      i3 = range(lengthSP+donSP,lengthSP+donSP+accSP)
+      i4 = range(lengthSP+donSP+accSP,lengthSP+donSP+accSP+mmatrixSP)
+      i5 = range(lengthSP+donSP+accSP+mmatrixSP,lengthSP+donSP+accSP+mmatrixSP+totalQualSP)
+      intervals = [] #i5,\ #i2,#i3,#i4,#i5]
+      zero_out(param,intervals)
+      pdb.set_trace()
+
+      # Set the parameters such as limits penalties for the Plifs
+      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+
       # Initialize solver 
       if Conf.USE_OPT:
          self.plog('Initializing problem...\n')
          solver = SIQPSolver(Conf.numFeatures,numExamples,Conf.C,self.logfh)
-         solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
-         solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
+         #solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
+         #solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
 
       # stores the number of alignments done for each example (best path, second-best path etc.)
       num_path = [anzpath]*numExamples
       # stores the gap for each example
       gap      = [0.0]*numExamples
-
       #############################################################################################
       # Training
       #############################################################################################
@@ -197,9 +156,12 @@ class QPalma:
       currentPhi = zeros((Conf.numFeatures,1))
       totalQualityPenalties = zeros((totalQualSP,1))
 
+      suboptimal_example = 0
       iteration_nr = 0
       param_idx = 0
       const_added_ctr = 0
+
+      # the main training loop
       while True:
          if iteration_nr == iteration_steps:
             break
@@ -210,6 +172,7 @@ class QPalma:
 
             dna = Sequences[exampleIdx] 
             est = Ests[exampleIdx] 
+            est = "".join(est)
 
             if Conf.mode == 'normal':
                quality = [40]*len(est)
@@ -218,29 +181,33 @@ class QPalma:
                quality = Qualities[exampleIdx]
 
             exons = Exons[exampleIdx] 
-            # NoiseMatrix = Noises[exampleIdx] 
             don_supp = Donors[exampleIdx] 
             acc_supp = Acceptors[exampleIdx] 
 
             # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
-            trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+            if Conf.mode == 'using_quality_scores':
+               trueSpliceAlign, trueWeightMatch, trueWeightQuality =\
+               computeSpliceAlignWithQuality(dna, exons, est, quality, qualityPlifs)
+            else:
+               trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
             
             # Calculate the weights
             trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
             trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
 
-            #idx_lst = [p for p,e in enumerate(trueWeightIntron) if e > 1e-5]
-            #print idx_lst
-            #print [h.limits[p] for p in idx_lst]
-
-            currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
-            currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
-            currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
-            currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
+            currentPhi[0:lengthSP]                                            = mat(h.penalties[:]).reshape(lengthSP,1)
+            currentPhi[lengthSP:lengthSP+donSP]                               = mat(d.penalties[:]).reshape(donSP,1)
+            currentPhi[lengthSP+donSP:lengthSP+donSP+accSP]                   = mat(a.penalties[:]).reshape(accSP,1)
+            currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP]   = mmatrix[:]
 
             if Conf.mode == 'using_quality_scores':
                totalQualityPenalties = param[-totalQualSP:]
-               currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                    = totalQualityPenalties[:]
+               currentPhi[lengthSP+donSP+accSP+mmatrixSP:]                    = totalQualityPenalties[:]
+
+            zero_out(currentPhi,intervals)
+            zero_out(trueWeight,intervals)
+
+            #pdb.set_trace()
 
             # Calculate w'phi(x,y) the total score of the alignment
             trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
@@ -253,7 +220,6 @@ class QPalma:
 
             AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
             AlignmentScores[0] = trueAlignmentScore
-
             ################## Calculate wrong alignment(s) ######################
 
             # Compute donor, acceptor with penalty_lookup_new
@@ -293,7 +259,6 @@ class QPalma:
 
             # Create the alignment object representing the interface to the C/C++ code.
             currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
-
             c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
             #print 'Calling myalign...'
             # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
@@ -302,9 +267,6 @@ class QPalma:
              acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
              print_matrix)
 
-            #print 'After calling myalign...'
-            #print 'Calling getAlignmentResults...'
-
             c_SpliceAlign       = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
             c_EstAlign          = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
             c_WeightMatch       = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
@@ -354,10 +316,12 @@ class QPalma:
             del c_WeightMatch
             del c_DPScores
             del c_qualityPlifsFeatures
-            del currentAlignment
+            #del currentAlignment
 
             newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
             newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
+
+            newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],Conf.totalQualSuppPoints)
             # Calculate weights of the respective alignments Note that we are
             # calculating n-best alignments without hamming loss, so we
             # have to keep track which of the n-best alignments correspond to
@@ -367,7 +331,6 @@ class QPalma:
             # equivalence to the true alignment for each decoded alignment.
             true_map = [0]*(num_path[exampleIdx]+1)
             true_map[0] = 1
-            path_loss   = [0]*(num_path[exampleIdx])
 
             for pathNr in range(num_path[exampleIdx]):
                #print 'decodedWeights' 
@@ -376,33 +339,21 @@ class QPalma:
                acc_supp)
 
                decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
-               for qidx in range(Conf.totalQualSuppPoints):
-                  decodedQualityFeatures[qidx] = newQualityPlifsFeatures[(pathNr*Conf.totalQualSuppPoints)+qidx]
-
-               #pdb.set_trace()
-
-               path_loss[pathNr] = 0
-               # sum up positionwise loss between alignments
-               for alignPosIdx in range(newSpliceAlign[pathNr,:].shape[1]):
-                  if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
-                     path_loss[pathNr] += 1
-
-               #pdb.set_trace()
-
+               decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
                # Gewichte in restliche Zeilen der Matrix speichern
-               wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
-               allWeights[:,pathNr+1] = wp
-
-               #pdb.set_trace()
+               allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
 
                hpen = mat(h.penalties).reshape(len(h.penalties),1)
                dpen = mat(d.penalties).reshape(len(d.penalties),1)
                apen = mat(a.penalties).reshape(len(a.penalties),1)
-               features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
+               features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
 
-               AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
+               zero_out(features,intervals)
+               zero_out(allWeights[:,pathNr+1],intervals)
 
-               # Check wether scalar product + loss equals viterbi score
+               #pdb.set_trace()
+
+               AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
 
                distinct_scores = False
                if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
@@ -412,11 +363,23 @@ class QPalma:
                if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
                   pdb.set_trace()
 
+               if exampleIdx == 1:
+                  self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
+                  self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
+
+               pdb.set_trace()
+
                # if the pathNr-best alignment is very close to the true alignment consider it as true
                if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
                   true_map[pathNr+1] = 1
 
-               # assert AlignmentScores[0] > max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
+               #pdb.set_trace()
+
+               #assert AlignmentScores[0] <= max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
+               if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
+                  suboptimal_example += 1
+                  self.plog("suboptimal_example %d\n" %exampleIdx)
+
 
                # the true label sequence should not have a larger score than the maximal one WHYYYYY?
                # this means that all n-best paths are to close to each other 
@@ -431,10 +394,35 @@ class QPalma:
                      firstFalseIdx = map_idx
                      break
 
+               #if exampleIdx == 1:
+               if False:
+
+                  self.plog("Is considered as: %d\n" % true_map[1])
+
+                  result_len = currentAlignment.getResultLength()
+                  c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
+                  c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
+
+                  currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
+
+                  dna_array = [0.0]*result_len
+                  est_array = [0.0]*result_len
+
+                  for r_idx in range(result_len):
+                     dna_array[r_idx] = c_dna_array[r_idx]
+                     est_array[r_idx] = c_est_array[r_idx]
+
+                  _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
+                  _newEstAlign = newEstAlign[0].flatten().tolist()[0]
+                  line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
+                  self.plog(line1+'\n')
+                  self.plog(line2+'\n')
+                  self.plog(line3+'\n')
+
                # if there is at least one useful false alignment add the
                # corresponding constraints to the optimization problem
                if firstFalseIdx != -1:
-                  trueWeights       = allWeights[:,0]
+                  trueWeights       = trueWeight
                   firstFalseWeights = allWeights[:,firstFalseIdx]
                   differenceVector  = trueWeights - firstFalseWeights
                   #pdb.set_trace()
@@ -476,7 +464,10 @@ class QPalma:
          #
          # end of one iteration through all examples
          #
-         if self.noImprovementCtr == numExamples+1:
+
+         self.plog("suboptimal rounds %d\n" %suboptimal_example)
+
+         if self.noImprovementCtr == numExamples*2:
             break
 
          iteration_nr += 1
@@ -580,7 +571,11 @@ class QPalma:
          acc_supp = Acceptors[exampleIdx] 
 
          # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
-         trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+         if Conf.mode == 'using_quality_scores':
+            trueSpliceAlign, trueWeightMatch, trueWeightQuality =\
+            computeSpliceAlignWithQuality(dna, exons, est, quality, qualityPlifs)
+         else:
+            trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
          
          # Calculate the weights
          trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
@@ -704,10 +699,10 @@ class QPalma:
          _newSpliceAlign = newSpliceAlign.flatten().tolist()[0]
          _newEstAlign = newEstAlign.flatten().tolist()[0]
 
-         #line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
-         #self.plog(line1+'\n')
-         #self.plog(line2+'\n')
-         #self.plog(line3+'\n')
+         line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
+         self.plog(line1+'\n')
+         self.plog(line2+'\n')
+         self.plog(line3+'\n')
 
          weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
 
@@ -819,7 +814,8 @@ class QPalma:
       e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
 
       self.plog(("Example %d"%exampleIdx) + str(exons) + " "+  str(newExons) + "\n")
-      self.plog(("SpliceAlign " + str(SpliceAlign)+ "\n"))
+      #self.plog(("SpliceAlign " + str(SpliceAlign)+ "\n"))
+      self.plog(("read: " + str(est)+ "\n"))
 
       if len(newExons) == 4:
          e1_begin,e1_end = newExons[0],newExons[1]
@@ -936,38 +932,46 @@ def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
 
    return line1,line2,line3
 
-   #exon_length[-1] = exon_length[-1] - tEnd + tIDX[last_identity] + 1
-
-   #tStart = tIDX[first_identity]
-   #tStarts[0] = tIDX[first_identity]
-   #tEnd = tIDX[last_identity]
-   #tEnds[-1] = tIDX[last_identity]
-   #qStart = qIDX[first_identity]
-   #qStarts[0] = qIDX[first_identity]
-   #qEnd = qIDX[last_identity]
-   #qEnds[-1] = qIDX[last_identity]
-
-   #align_info.qStart = qStart
-   #align_info.qEnd = qEnd
-   #align_info.tStart = tStart
-   #align_info.tEnd = tEnd
-   #align_info.num_exons = num_exons
-   #align_info.qExonSizes = qExonSizes
-   #align_info.qStarts = qStarts
-   #align_info.qEnds = qEnds
-   #align_info.tExonSizes =tExonSizes
-   #align_info.tStarts = tStarts
-   #align_info.tEnds = tEnds
-   #align_info.exon_length = exon_length
-   #align_info.identity = identity
-   #align_info.ssQuality = ssQuality
-   #align_info.exonQuality = exonQuality
-   #align_info.comparison = comparison
-   #align_info.qIDX = qIDX
-   #align_info.tIDX = tIDX 
-   #align_info.dna_letters = dna_letters
-   #align_info.est_letters = est_letters
+def calc_info(acc,don,exons):
+   min_score = 100
+   max_score = -100
+
+   for elem in acc:
+      for score in elem:
+         if score != -inf:
+            min_score = min(min_score,score)
+            max_score = max(max_score,score)
+
+   print 'Acceptors max/min: %f / %f' % (max_score,min_score)
+
+   min_score = 100
+   max_score = -100
+
+   for elem in don:
+      for score in elem:
+         if score != -inf:
+            min_score = min(min_score,score)
+            max_score = max(max_score,score)
+
+   print 'Donor max/min: %f / %f' % (max_score,min_score)
+
+   min_score = 10000
+   max_score = 0
+   mean_intron_len = 0
+
+   for elem in exons:
+      intron_len = int(elem[1,0] - elem[0,1])
+      mean_intron_len += intron_len
+      min_score = min(min_score,intron_len)
+      max_score = max(max_score,intron_len)
+
+   mean_intron_len /= 1.0*len(exons)
+   print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
 
+def zero_out(vec,intervals):
+   for interval in intervals:
+      for pos in interval:
+         vec[pos] = 0.0
 
 if __name__ == '__main__':
    qpalma = QPalma()
index 48fb924..71679c4 100644 (file)
@@ -23,17 +23,11 @@ int compare_gene_struct(struct gene* a, struct gene* b) {
 }
 
 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
-
 void sort_genes(struct gene*** allGenes, int numGenes);
-
 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
-
 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
-
 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
-
-int fitting(char* up_prb, char* down_prb);
-
+int fitting(char* up_prb, char* down_prb, int u_off, int d_off);
 void remove_ambiguities(char * old_seq, char* new_seq);
 
 static char *info = "Usage is:\n./filterReads gff reads output";
@@ -53,6 +47,7 @@ const int max_overlap = 30;
 int read_nr = 1;
 
 int combined_reads = 0;
+
 int main(int argc, char* argv[]) {
 
    if(argc != 5) {
@@ -259,53 +254,11 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          goto next_read;
       }
 
-      FA(currentGene != 0);
-
-      if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
-
-         if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
-
-            if (uov != 0 && dov != 0)
-               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
-
-            for(up_idx=0;up_idx<uov;up_idx++) {
-               free_read(upstream_overlap[up_idx]);
-               free(upstream_overlap[up_idx]);
-            }
-
-            for(down_idx=0;down_idx<dov;down_idx++) {
-               free_read(downstream_overlap[down_idx]);
-               free(downstream_overlap[down_idx]);
-            }
-
-            uov = dov = 0;
+      FA(strlen(seq) >= read_size);
 
-            gene_idx++;
-            exon_idx = 1;
-            currentGene = (*allGenes)[gene_idx];
-            while( currentGene == 0 && gene_idx < numGenes) {
-               currentGene = (*allGenes)[gene_idx];
-               gene_idx++;
-            }
-            //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
-            continue;
-         }
-
-         if ( pos < currentGene->start ) { // go to next read
-            skippedReadCtr++;
-            
-            next_read:
-            
-            lineBeginPtr = lineEndPtr;
-            while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
-            lineEndPtr++;
-            readCtr += 1;
-            current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
-            current_line[lineEndPtr-lineBeginPtr] = '\0';
-            continue;
-         }
+      FA(currentGene != 0);
 
-      } else { // read IS within gene borders
+      if ( currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop) { // read is not within gene borders
 
          exon_label:
 
@@ -340,6 +293,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
                free_read(downstream_overlap[down_idx]);
                free(downstream_overlap[down_idx]);
             }
+
             uov = dov = 0;
 
             exon_idx++;
@@ -377,10 +331,54 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
          uselessReadCtr++;
          goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
+
+      } else { // read is not within gene borders
+
+         if (uov != 0 && dov != 0)
+            combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
+
+         for(up_idx=0;up_idx<uov;up_idx++) {
+            free_read(upstream_overlap[up_idx]);
+            free(upstream_overlap[up_idx]);
+         }
+
+         for(down_idx=0;down_idx<dov;down_idx++) {
+            free_read(downstream_overlap[down_idx]);
+            free(downstream_overlap[down_idx]);
+         }
+
+         uov = dov = 0;
+
+         if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
+            gene_idx++;
+            exon_idx = 1;
+            currentGene = (*allGenes)[gene_idx];
+            while( currentGene == 0 && gene_idx < numGenes) {
+               currentGene = (*allGenes)[gene_idx];
+               gene_idx++;
+            }
+            //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
+            continue;
+         }
+
+         if ( pos < currentGene->start ) { // go to next read
+            skippedReadCtr++;
+            
+            next_read:
+            
+            lineBeginPtr = lineEndPtr;
+            while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
+            lineEndPtr++;
+            readCtr += 1;
+            current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
+            current_line[lineEndPtr-lineBeginPtr] = '\0';
+            continue;
+         }
       }
    }
 
-   combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
+   if (uov != 0 && dov != 0)
+      combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
 
    for(up_idx=0;up_idx<uov;up_idx++) {
       free_read(upstream_overlap[up_idx]);
@@ -418,9 +416,9 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx) {
 
-   printf("up/down size is %d/%d\n",up_size,down_size);
+   //printf("up/down size is %d/%d\n",up_size,down_size);
 
-   int up_idx, down_idx, overlap, success;
+   int up_idx, down_idx, success;
 
    char* up_used_flag = calloc(up_size,sizeof(char));
    char* down_used_flag = calloc(down_size,sizeof(char));
@@ -434,10 +432,9 @@ void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, R
 
       currentUpRead = upstream[up_idx];
 
-      overlap = exon_stop - currentUpRead->pos;
-
       for(down_idx=0;down_idx<down_size;down_idx++) {
-         if( down_used_flag[down_idx] == 1)
+
+         if( up_used_flag[up_idx] == 1 || down_used_flag[down_idx] == 1)
             continue;
 
          currentDownRead = downstream[down_idx];
@@ -447,7 +444,7 @@ void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, R
 
          success = join_reads(exon_stop,exon_start,currentUpRead,currentDownRead,out_fs,gene_id,exon_idx);
          
-         if(success) {
+         if(success == 1) {
             up_used_flag[up_idx] = 1;
             down_used_flag[down_idx] = 1;
          }
@@ -480,7 +477,7 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    u_size = -1;
    d_size = -1;
 
-   printf("possible range %d %d\n",up_range,down_range);
+   //printf("possible range %d %d\n",up_range,down_range);
 
    if(up_range+down_range < read_size)
       return 0;
@@ -510,16 +507,14 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    int p_stop   = exon_start + d_size - 1;
 
    u_off = p_start - up_read->pos;
-   d_off = exon_start - down_read->pos + 1;
+   d_off = exon_start - down_read->pos;
 
    FA(u_size != -1);
    FA(d_size != -1);
    FA(u_size + d_size == read_size);
 
-   printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
-   printf("u/d size: %d %d\n",u_size,d_size);
-   printf("u/d off: %d %d\n",u_off,d_off);
-
+   //printf("%lu %lu\n",up_read->id,down_read->id);
+   
    remove_ambiguities(up_read->seq,new_up_seq);
    remove_ambiguities(down_read->seq,new_down_seq);
 
@@ -539,10 +534,33 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size);
    new_chastity[read_size] = '\0';
 
-   //printf("%s\n",new_seq);
-   //printf("%s\n",new_prb);
-   //printf("%s\n",new_cal_prb);
-   //printf("%s\n",new_chastity);
+   int status = fitting(up_read->prb,down_read->prb,u_off,d_off);
+
+   if(status != 1)
+      return status;
+
+   //if( read_nr == 13 || read_nr == 14 )  {
+   //   printf("read nr %d",read_nr);
+   //   printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
+   //   printf("u/d range: %d %d\n",up_range,down_range);
+   //   printf("u/d size: %d %d\n",u_size,d_size);
+   //   printf("u/d off: %d %d\n",u_off,d_off);
+   //   printf("******************\n");
+
+   //   printf("%s\n",up_read->seq);
+   //   printf("%s\n",down_read->seq);
+
+   //   printf("******************\n");
+
+   //   printf("%s\n",new_up_seq);
+   //   printf("%s\n",new_down_seq);
+
+   //   printf("******************\n");
+   //   printf("%s\n",new_seq);
+   //   printf("%s\n",new_prb);
+   //   printf("%s\n",new_cal_prb);
+   //   printf("%s\n",new_chastity);
+   //}
 
    //// The four last integers specify the positions of 
    //// p_start : the position in the dna where the first read starts
@@ -554,6 +572,7 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
       read_nr,up_read->chr,up_read->strand,new_seq,u_size,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
 
    read_nr++;
+   combined_reads++;
 
    free(new_up_seq);
    free(new_down_seq); 
@@ -566,86 +585,37 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    return 1;
 }
 
-int fitting(char* up_prb, char* down_prb) {
-   double epsilon_mean = 30.0;
-   double epsilon_var = 30.0;
-   int w_size = 6;
-
+int fitting(char* up_prb, char* down_prb, int u_off, int d_off) {
    double current_mean_up = 0;
-   double current_variance_up = 0;
    double current_mean_down = 0;
-   double current_variance_down = 0;
 
-   double* mean_up       = malloc(sizeof(double)*read_size-2*w_size);
-   double* variance_up   = malloc(sizeof(double)*read_size-2*w_size);
-   double* mean_down     = malloc(sizeof(double)*read_size-2*w_size);
-   double* variance_down = malloc(sizeof(double)*read_size-2*w_size);
+   int w_size = 3;
 
-   int iidx = -1;
-   int uidx;
-   int didx;
+   int idx;
 
-   for(uidx=iidx-w_size;uidx<iidx;uidx++) {
-      didx = uidx+w_size;
-      current_mean_up += up_prb[uidx]-50;
-      current_mean_down += up_prb[didx]-50;
-   }
+   for(idx=0;idx<w_size;idx++) {
+      current_mean_up += up_prb[u_off+idx]-50;
+      current_mean_up += up_prb[u_off-idx]-50;
+      current_mean_up -= up_prb[idx]-50;
 
-   current_mean_up /= w_size;
-   current_mean_down /= w_size;
-
-   for(uidx=iidx-w_size;uidx<iidx;uidx++) {
-      didx = uidx+w_size;
-      current_variance_up += pow(up_prb[uidx]-50 - current_mean_up,2);
-      current_variance_down += pow(up_prb[didx]-50 - current_mean_up,2);
+      current_mean_down += up_prb[d_off+idx]-50;
+      current_mean_down += up_prb[d_off-idx]-50;
+      current_mean_down -= up_prb[idx]-50;
    }
 
-   current_variance_up /= (w_size-1);
-   current_variance_down /= (w_size-1);
-
-   for(iidx=w_size;iidx<30;iidx++) {
-      for(uidx=iidx-w_size;uidx<iidx;uidx++) {
-         didx = uidx+w_size;
-         mean_up[iidx-w_size] += down_prb[uidx]-50;
-         mean_down[iidx-w_size] += down_prb[didx]-50;
-
-      }
-      mean_up[iidx-w_size] /= w_size;
-      mean_down[iidx-w_size] /= w_size;
+   current_mean_up /= (2*w_size+1);
+   current_mean_down /= (2*w_size+1);
 
-      for(uidx=iidx-w_size;uidx<iidx;uidx++) {
-         didx = uidx+w_size;
-         variance_up[iidx-w_size] += pow(down_prb[uidx]-50 - mean_up[iidx-w_size],2);
-         variance_down[iidx-w_size] += pow(down_prb[didx]-50 - mean_down[iidx-w_size],2);
-      }
-      variance_up[iidx-w_size] /= (w_size-1);
-      variance_down[iidx-w_size] /= (w_size-1);
-
-      //printf("means: %f %f %f %f, variances: %f %f %f %f\n",current_mean_up,current_mean_down,mean_up,mean_down,current_variance_up,current_variance_down,variance_up,variance_down);
-      //if ( abs(current_mean_up - mean_up) < epsilon_mean && abs(current_variance_up - variance_up) < epsilon_var 
-      //&& abs(current_mean_down - mean_down) < epsilon_mean && abs(current_variance_down - variance_down) < epsilon_var )
-      //   return iidx;
-   }
+   double ratio;
+   if( current_mean_up  > current_mean_down)
+      ratio = current_mean_down / current_mean_up;
+   else
+      ratio = current_mean_up / current_mean_down;
 
-   int bidx;
-   int bestIdx = -1;
-   double min = 1000.0;
-   for(bidx=0;bidx<read_size-2*w_size;bidx++) {
-      if ( abs(current_mean_up - mean_up[bidx]) < epsilon_mean && abs(current_variance_up - variance_up[bidx]) < epsilon_var 
-      && abs(current_mean_down - mean_down[bidx]) < epsilon_mean && abs(current_variance_down - variance_down[bidx]) < epsilon_var ) {
-         if ( abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx])) {
-            min = abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx]);
-            bestIdx = bidx;
-         }
-      }
-   }
-
-   free(mean_up);
-   free(variance_up);
-   free(mean_down);
-   free(variance_down);
+   if (ratio < 0.5)
+      return 0;
 
-   return bestIdx;
+   return 1;
 }
 
 void remove_ambiguities(char * old_seq, char* new_seq) {
index 667000f..90bf300 100644 (file)
@@ -70,8 +70,10 @@ def plot_param(filename,train_with_intronlengthinformation=1):
     # plot quality score transformations
     pylab.figure()
     for pos,plif in enumerate(qualityPlifs):
-      if pos in [1,8,15,22]:
-         pylab.plot(plif.limits,plif.penalties,'b^-',hold=True)
+      #if pos in [1,8,15,22]:
+      #   pylab.plot(plif.limits,plif.penalties,'b^-',hold=True)
+
+      pylab.plot(plif.limits,plif.penalties,'b^-',hold=True)
     pylab.xlabel('quality value',fontsize=20)
     pylab.ylabel('transition score',fontsize=20)