+ extended dataset compilation function
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 5 Feb 2008 21:56:44 +0000 (21:56 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 5 Feb 2008 21:56:44 +0000 (21:56 +0000)
+ added regularization parameters
+ fixed a bug in the regularizer (off-diagonal terms)
+ decreased number of support points

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7688 e1793c9e-67f9-0310-80fc-b846ff1f7b36

dyn_prog/qpalma_dp.cpp
qpalma/Configuration.py
qpalma/SIQP.py
qpalma/computeSpliceWeights.py
qpalma/set_param_palma.py
scripts/compile_dataset.py
scripts/qpalma_main.py
tools/data_tools/filterReads.py [new file with mode: 0644]

index 7786745..c2cae37 100644 (file)
@@ -96,6 +96,11 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
    //      printf("%f ",pidx,p.penalties[pidx]);
    //   printf("\n");
    //}
+   //
+
+   //int h_idx;
+   //for(h_idx=0;h_idx<h.len;h_idx++)
+   //   printf("%d %f %f\n",h_idx,h.limits[h_idx],h.penalties[h_idx]);
   
   //printf("calling fill_matrix...\n");
   fill_matrix(nr_paths, matrices, est_len, dna_len, est, dna, prb, &h, matchmatrix, qualityScores, donor, acceptor, remove_duplicate_scores, nr_donor_sites, donor_sites, max_score_positions,currentMode);
@@ -120,7 +125,7 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
       mmatrix_param_size = (mlen*mlen)*nr_paths;
       mmatrix_size = mlen*mlen;
    }
-
+   
    alignmentscores_size = nr_paths; //alignment score for each path/matrix
    numPathsPlifs = numPlifs*nr_paths; //alignment score for each path/matrix
 
index 1d07457..eb913fd 100644 (file)
@@ -20,8 +20,8 @@ fixedParamQ = cPickle.load(open('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/
 #
 C = 1
 
-min_intron_len = 10
-max_intron_len = 1000
+min_intron_len = 20
+max_intron_len = 2000
 
 min_svm_score = 0.0 
 max_svm_score = 1.0
@@ -72,10 +72,15 @@ mode = 'using_quality_scores'
 # instead.
 ###############################################################################
 
-numDonSuppPoints     = 30
-numAccSuppPoints     = 30
-numLengthSuppPoints  = 30 
-numQualSuppPoints    = 16
+#numLengthSuppPoints  = 30 
+#numDonSuppPoints     = 30
+#numAccSuppPoints     = 30
+#numQualSuppPoints    = 16
+
+numLengthSuppPoints  = 20 
+numDonSuppPoints     = 20
+numAccSuppPoints     = 20
+numQualSuppPoints    = 4
 
 min_qual = -1
 max_qual = 40
@@ -130,10 +135,10 @@ else:
 ###############################################################################
 
 training_begin    =    0
-training_end      = 1500
+training_end      = 2000
 
-prediction_begin  = 0
-prediction_end    = 1500
+prediction_begin  = 2000
+prediction_end    = 3500
 
 joinp = os.path.join
 
@@ -144,9 +149,9 @@ original_path  = joinp(data_path,'original_solexa_data')
 annot_path     = joinp(data_path,'annotation_data')
 remapped_path  = joinp(data_path,'remapped_solexa_data')
 
-dna_flat_fn    = joinp(data_path,'allGenes.pickle')
-gff_fn         = joinp(annot_path,'TAIR7_GFF3_genes_Chr1.gff_v1')
-filtered_fn    = joinp(data_path,'filteredReads_1_recent')
+dna_flat_fn    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+gff_fn         = joinp(annot_path,'TAIR7_GFF3_genes_Chr%s.gff_v1')
+filtered_fn    = joinp(data_path,'allFilteredReads_04_02_2008')
 remapped_fn    = joinp(remapped_path,'map_best_hit.18.unambig')
 
 dataset_fn = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/chr1_dataset.pickle'
@@ -163,6 +168,5 @@ assert numLengthSuppPoints > 1
 assert numQualSuppPoints   > 1
 
 assert os.path.exists(dna_flat_fn), 'DNA data does not exist!'
-assert os.path.exists(gff_fn), 'EST/Reads data does not exist!'
 assert os.path.exists(filtered_fn), 'EST/Reads data does not exist!'
 assert os.path.exists(remapped_fn), 'EST/Reads data does not exist!'
index 08e51f6..638f41e 100644 (file)
@@ -72,22 +72,46 @@ class SIQP:
       totalQualSP = Configuration.totalQualSuppPoints
       numQPlifs = Configuration.numQualPlifs 
 
-      regC = self.numFeatures / 1.0*self.numExamples
+
+      #lengthGroupParam  = 5*1e-9
+      #spliceGroupParam  = 1e-8
+
+      lengthGroupParam  = 5*1e-9
+      spliceGroupParam  = 1e-9
+
+      self.P[0:lengthSP,0:lengthSP] *= lengthGroupParam
+
+      beg = lengthSP
+      end = lengthSP+donSP+accSP
+      self.P[beg:end,beg:end]       *= spliceGroupParam
+
+      regC = self.numFeatures / (1.0*self.numExamples)
+
+      #cfactor = 1e-8
+      cfactor = 5e-9
 
       for j in range(lengthSP-1):
-         self.P[j,j+1] = -1.0 
-         self.P[j+1,j] = -1.0
-         self.P[j,j] += 1.0
+         self.P[j,j+1]    = -cfactor
+         self.P[j+1,j]    = -cfactor
+         self.P[j,j]     += cfactor
+         self.P[j+1,j+1] += cfactor
+
+      #cfactor = 1e-8
+      cfactor = 5e-9
 
       for j in range(lengthSP,lengthSP+donSP-1):
-         self.P[j,j+1] = -1.0 
-         self.P[j+1,j] = -1.0
-         self.P[j,j] += 1.0
+         cfactor = spliceGroupParam
+         self.P[j,j+1] = -cfactor
+         self.P[j+1,j] = -cfactor
+         self.P[j,j] += cfactor
+         self.P[j+1,j+1] += cfactor
          
       for j in range(lengthSP+donSP,lengthSP+donSP+accSP-1):
-         self.P[j,j+1] = -1.0 
-         self.P[j+1,j] = -1.0
-         self.P[j,j] += 1.0
+         cfactor = spliceGroupParam
+         self.P[j,j+1] = -cfactor
+         self.P[j+1,j] = -cfactor
+         self.P[j,j] += cfactor
+         self.P[j+1,j+1] += cfactor
 
       offset = lengthSP+donSP+accSP+mmatrixSP
       for k in range(numQPlifs):
@@ -101,17 +125,30 @@ class SIQP:
 
       # 0.25 for each was already good
 
-      lengthGroupParam  = 0.005
-      spliceGroupParam  = 0.005
+      matchGroupParam   = 1.0 
+      qualityGroupParam = 1.0 
 
-      matchGroupParam   = 0.495
-      qualityGroupParam = 0.495
+      # lengthGroupParam  = 0.005
+      # spliceGroupParam  = 0.005
+      # matchGroupParam   = 0.495
+      # qualityGroupParam = 0.495
 
-      self.P[0:lengthSP,0:lengthSP] *= lengthGroupParam
+      #lengthGroupParam  = lengthSP / (1.0*lengthSP+donSP+accSP)
+      #spliceGroupParam  = donSP+accSP / (1.0*lengthSP+donSP+accSP)
+      #matchGroupParam   = mmatrixSP/(1.0*mmatrixSP+totalQualSP)
+      #qualityGroupParam = totalQualSP/(1.0*mmatrixSP+totalQualSP)
+
+
+      #denom = (1.0*lengthSP+donSP+accSP+mmatrixSP+totalQualSP)
+      #ratio1 = lengthSP+donSP+accSP / denom
+      #ratio2 = mmatrixSP+totalQualSP / denom 
+
+      #lengthGroupParam  *= ratio1
+      #spliceGroupParam  *= ratio1
+
+      #matchGroupParam   *= ratio2 
+      #qualityGroupParam *= ratio2 
 
-      beg = lengthSP
-      end = lengthSP+donSP+accSP
-      self.P[beg:end,beg:end]       *= spliceGroupParam
 
       beg = lengthSP+donSP+accSP
       end = lengthSP+donSP+accSP+mmatrixSP
@@ -120,7 +157,7 @@ class SIQP:
       beg = lengthSP+donSP+accSP+mmatrixSP
       self.P[beg:-self.numExamples,beg:-self.numExamples]   *= qualityGroupParam
 
-      self.P[0:-self.numExamples,0:-self.numExamples]       *= regC*1e-3
+      self.P[0:-self.numExamples,0:-self.numExamples]       *= regC
 
 
    def createRegularizer(self):
index 14b300c..31ff877 100644 (file)
@@ -21,7 +21,7 @@ import pdb
 #  y_i and y_i+1 parameters, so the fractions are saved in the weight vectors!
 
 def calculateWeights(plf, scores):
-   currentWeight = zeros((30,1))
+   currentWeight = zeros((len(plf.limits),1))
    
    for k in range(len(scores)):
       value = scores[k]
@@ -106,6 +106,7 @@ def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp):
    for pos in range(len(intron_starts)):
       values[pos] = intron_ends[pos] - intron_starts[pos] + 1
 
+   #print 'introns found: ' + str(values)
    weightIntron = calculateWeights(h,values)
 
    return weightDon, weightAcc, weightIntron
index 7bbb0da..0508a98 100644 (file)
@@ -5,7 +5,7 @@ import math
 import numpy.matlib
 import QPalmaDP
 import pdb
-import Configuration
+import Configuration as Conf
 from Plif import *
 
 def set_param_palma(param, train_with_intronlengthinformation,\
@@ -22,14 +22,14 @@ def set_param_palma(param, train_with_intronlengthinformation,\
    h = Plf()
    d = Plf()
    a = Plf()
-   qualityPlifs = [None]*Configuration.numQualPlifs
+   qualityPlifs = [None]*Conf.numQualPlifs
 
-   donSP       = Configuration.numDonSuppPoints
-   accSP       = Configuration.numAccSuppPoints
-   lengthSP    = Configuration.numLengthSuppPoints
-   mmatrixSP   = Configuration.sizeMatchmatrix[0]\
-   *Configuration.sizeMatchmatrix[1]
-   totalQualSP = Configuration.totalQualSuppPoints
+   donSP       = Conf.numDonSuppPoints
+   accSP       = Conf.numAccSuppPoints
+   lengthSP    = Conf.numLengthSuppPoints
+   mmatrixSP   = Conf.sizeMatchmatrix[0]\
+   *Conf.sizeMatchmatrix[1]
+   totalQualSP = Conf.totalQualSuppPoints
 
    ####################
    # Gapfunktion
@@ -77,21 +77,21 @@ def set_param_palma(param, train_with_intronlengthinformation,\
    # Matchmatrix
    ####################
    mmatrix = numpy.matlib.mat(param[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP])
-   mmatrix.reshape(Configuration.sizeMatchmatrix[0],Configuration.sizeMatchmatrix[1]) 
+   mmatrix.reshape(Conf.sizeMatchmatrix[0],Conf.sizeMatchmatrix[1]) 
 
    ####################
    # Quality Plifs
    ####################
-   for idx in range(Configuration.numQualPlifs):
+   for idx in range(Conf.numQualPlifs):
       currentPlif = Plf()
-      currentPlif.limits    = linspace(Configuration.min_qual,Configuration.max_qual,Configuration.numQualSuppPoints) 
-      begin                 = lengthSP+donSP+accSP+mmatrixSP+(idx*Configuration.numQualSuppPoints)
-      end                   = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Configuration.numQualSuppPoints)
+      currentPlif.limits    = linspace(Conf.min_qual,Conf.max_qual,Conf.numQualSuppPoints) 
+      begin                 = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
+      end                   = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
       currentPlif.penalties = param[begin:end].flatten().tolist()[0]
       currentPlif.transform = '' 
       currentPlif.name      = 'q' 
-      currentPlif.max_len   = Configuration.max_qual 
-      currentPlif.min_len   = Configuration.min_qual
+      currentPlif.max_len   = Conf.max_qual 
+      currentPlif.min_len   = Conf.min_qual
       qualityPlifs[idx] = currentPlif
 
    return h,d,a,mmatrix,qualityPlifs
index 1bfafd3..c4f3b88 100644 (file)
@@ -54,11 +54,7 @@ info = """
    - 'remappped_pos' - the position of the remapped read
    """
 
-def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file):
-
-   assert os.path.exists(gff_file)
-   #for file in dna_flat_files:
-   #   assert os.path.exists(file)
+def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
 
    assert os.path.exists(filtered_reads)
    assert os.path.exists(remapped_reads)
@@ -68,16 +64,25 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    joinp = os.path.join
 
    # first read the gff file(s) and create pickled gene information
-   allGenes = parse_gff(gff_file) #,joinp(tmp_dir,'gff_info.pickle'))
-   
-   print 'parsing filtered reads'
-   frp = FilteredReadParser(filtered_reads)
+   allGenes = [None]*6
 
-   print 'parsing remapped reads'
+   for i in range(1,6):
+      allGenes[i]= parse_gff(gff_file%i) #,joinp(tmp_dir,'gff_info.pickle'))
+
+   print 'parsing filtered reads..'
+   frp = FilteredReadParser(filtered_reads)
    all_filtered_reads = frp.parse()
+   print 'found %d filtered reads' % len(all_filtered_reads)
 
+   print 'parsing remapped reads'
    rrp = RemappedReadParser(remapped_reads)
    all_remapped_reads = rrp.parse()
+   if all_remapped_reads != None:
+      size_rem = len(all_remapped_reads)
+   else:
+      size_rem = 0
+
+   print 'found %d remapped reads' % size_rem
 
    # this stores the new dataset
    Sequences = []
@@ -88,6 +93,8 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    Qualities = []
    SplitPositions = []
    Ids = []
+
+   #pdb.set_trace()
    
    # Iterate over all remapped reads in order to generate for each read an
    # training / prediction example
@@ -99,10 +106,13 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
          currentRReads = None
 
       gene_id = currentFRead['gene_id']
-      currentGene = allGenes[gene_id]
+      chr = currentFRead['chr']
+      currentGene = allGenes[chr][gene_id]
+
+      if currentFRead['strand'] != '+':
+         continue
    
-      #seq, acc, don, exons, est, qual, spos = process_read(currentRReads,currentFRead,currentGene,genome_file,sscore_filename)
-      seq, acc, don, exons, est, qual, spos = process_read(currentRReads,currentFRead,currentGene)
+      seq, acc, don, exons, est, qual, spos = process_read(currentRReads,currentFRead,currentGene,dna_flat_files,test)
       
       if seq == '':
          continue
@@ -124,31 +134,32 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    io_pickle.save(dataset_file,dataset)
 
 
-#def process_read(reReads,fRead,currentGene,genome_file,sscore_filename):
-def process_read(reReads,fRead,currentGene):
+def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    """
    The purpose of this function is to create an example for QPalma 
    by using a 
 
    """
+   nil = ('','','','','','','')
+
    chr = fRead['chr']
    strand = fRead['strand']
 
    chrom             = 'chr%d' % chr
-   genome_file       = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
    sscore_filename   = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s' % (chr,strand)
 
    # use matlab-style functions to access dna sequence
-   currentSeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,genome_file,one_based=True)
+   currentSeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=True)
    currentSeq = currentSeq.lower()
 
-   if strand == '-':
-      currentSeq = reverse_complement(currentSeq)
-
-   nil = ('','','','','','','')
+   assert currentSeq != '', 'load_genomic returned empty sequence!'
 
-   if len(currentSeq) < (currentGene.stop - currentGene.start):
-      return nil
+   if strand == '-':
+      try:
+         currentSeq = reverse_complement(currentSeq)
+      except:
+         print 'invalid char'
+         return nil
 
    p_start     = fRead['p_start']
    exon_stop   = fRead['exon_stop']
@@ -159,9 +170,6 @@ def process_read(reReads,fRead,currentGene):
    quality = fRead['prb']
    spos = fRead['splitpos']
 
-   #if fRead['chr'] != 1 and fRead['strand'] != 'D':
-   #   return nil
-
    assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
    assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
    assert p_stop - p_start >= 36
@@ -183,8 +191,6 @@ def process_read(reReads,fRead,currentGene):
 
    # if we perform testing we cut a much wider region as we want to detect
    # how good we perform on a region
-   test = False
-
    if test:
       up_cut = up_cut-500
       if up_cut < 0:
@@ -209,9 +215,11 @@ def process_read(reReads,fRead,currentGene):
    try:
       if not (currentSeq[int(currentExons[0,1])] == 'g' and\
       currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
+         print 'not aligned'
          return nil
 
       if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
+         print 'not aligned'
          return nil
 
    except:
@@ -219,11 +227,11 @@ def process_read(reReads,fRead,currentGene):
 
    # now we want to use interval_query to get the predicted splice scores
    # trained on the TAIR sequence and annotation
-   
    interval_matrix = createIntArrayFromList([currentGene.start+up_cut-100,currentGene.start+down_cut+100])
    num_fields = 1
    num_intervals = 1
-   sscore_filename   = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_1+' 
+   sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
+   % (chr,strand)
 
    ag_tuple_pos = [p for p,e in enumerate(currentSeq) if p>0 and p<len(currentSeq)-1 and currentSeq[p-1]=='a' and e=='g' ]
 
@@ -253,7 +261,9 @@ def process_read(reReads,fRead,currentGene):
    del gf
 
    # fetch donor scores
-   sscore_filename   = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+' 
+   sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s'\
+   % (chr,strand)
+
    gf = Genef()
    num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
    assert num_hits <= len(currentSeq)
@@ -280,14 +290,6 @@ def process_read(reReads,fRead,currentGene):
 
    don = [-inf] + don[:-1]
 
-   #for p,e in enumerate(acc):
-   #   if p not in ag_tuple_pos:
-   #      acc[p] = -inf
-
-   #for p,e in enumerate(don):
-   #   if p not in gt_tuple_pos:
-   #      don[p] = -inf
-
    assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
    assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
 
@@ -308,7 +310,6 @@ def reverse_complement(seq):
 
    return new_seq
    
-
 if __name__ == '__main__':
    if len(sys.argv) == 1:
       print info
index 53d509a..1cf51b2 100644 (file)
@@ -52,7 +52,10 @@ class QPalma:
       self.ARGS = Param()
 
       #from compile_dataset import compile_d
-      #compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn)
+      #compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,False)
+
+      from compile_dataset import compile_d
+      compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,True)
 
       #gen_file= '%s/genome.config' % self.ARGS.basedir
       #ginfo_filename = 'genome_info.pickle'
@@ -104,6 +107,20 @@ class QPalma:
 
          print 'Donor max/min: %f / %f' % (max_score,min_score)
 
+         min_score = 10000
+         max_score = 0
+         mean_intron_len = 0
+
+         for elem in self.Exons:
+            intron_len = int(elem[1,0] - elem[0,1])
+            mean_intron_len += intron_len
+            min_score = min(min_score,intron_len)
+            max_score = max(max_score,intron_len)
+
+         mean_intron_len /= 1.0*len(self.Exons)
+         print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
+
+
          self.use_quality_scores = True
       else:
          assert(False)
@@ -146,6 +163,8 @@ class QPalma:
       # Set the parameters such as limits penalties for the Plifs
       [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
 
+      pdb.set_trace()
+
       # Initialize solver 
       if Conf.USE_OPT:
          self.plog('Initializing problem...\n')
@@ -200,17 +219,19 @@ class QPalma:
             # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
             trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
             
-            #print 'trueWeights' 
             # Calculate the weights
             trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
             trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
 
+            #idx_lst = [p for p,e in enumerate(trueWeightIntron) if e > 1e-5]
+            #print idx_lst
+            #print [h.limits[p] for p in idx_lst]
+
             currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
             currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
             currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
             currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
 
-            #pdb.set_trace()
             if Conf.mode == 'using_quality_scores':
                totalQualityPenalties = param[-totalQualSP:]
                currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                    = totalQualityPenalties[:]
@@ -315,7 +336,6 @@ class QPalma:
             for i in range(num_path[exampleIdx]):
                newDPScores[i] = c_DPScores[i]
 
-
             if self.use_quality_scores:
                for i in range(Conf.totalQualSuppPoints*num_path[exampleIdx]):
                   newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
@@ -367,6 +387,8 @@ class QPalma:
                wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
                allWeights[:,pathNr+1] = wp
 
+               #pdb.set_trace()
+
                hpen = mat(h.penalties).reshape(len(h.penalties),1)
                dpen = mat(d.penalties).reshape(len(d.penalties),1)
                apen = mat(a.penalties).reshape(len(a.penalties),1)
@@ -381,11 +403,10 @@ class QPalma:
                if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
                   distinct_scores = True
                
-               #if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) + [0,1][distinct_scores and (pathNr>0)] <= 1e-5:
                if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
                   pdb.set_trace()
 
-               #  # if the pathNr-best alignment is very close to the true alignment consider it as true
+               # if the pathNr-best alignment is very close to the true alignment consider it as true
                if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
                   true_map[pathNr+1] = 1
 
@@ -397,7 +418,7 @@ class QPalma:
                if len([elem for elem in true_map if elem == 1]) == len(true_map):
                   num_path[exampleIdx] = num_path[exampleIdx]+1
 
-               # Choose true and first false alignment for extending A
+               # Choose true and first false alignment for extending
                firstFalseIdx = -1
                for map_idx,elem in enumerate(true_map):
                   if elem == 0:
@@ -485,9 +506,6 @@ class QPalma:
    def predict(self,param_filename,beg,end):
       self.logfh = open('_qpalma_predict.log','w+')
 
-      beg = Conf.prediction_begin
-      end = Conf.prediction_end
-
       Sequences   = self.Sequences[beg:end]
       Exons       = self.Exons[beg:end]
       Ests        = self.Ests[beg:end]
diff --git a/tools/data_tools/filterReads.py b/tools/data_tools/filterReads.py
new file mode 100644 (file)
index 0000000..71646c5
--- /dev/null
@@ -0,0 +1,22 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*- 
+
+import sys
+import pdb
+
+import qpalma
+from qpalma.tools.parseGff import parse_gff
+
+def filter(gff_file,reads,out):
+   allGenes = parse_gff(gff_file)
+
+   out_fh = open(out,'w+')
+
+
+   out_fh.close()
+
+if __name__ == '__main__':
+   assert len(sys.argv) == 4, 'Usage: '
+   (gff_file,reads,out) = sys.argv[1:4]
+   filter(gff_file,reads,out)
+