+ minor mods
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 23 Jan 2008 15:13:40 +0000 (15:13 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 23 Jan 2008 15:13:40 +0000 (15:13 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7545 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/DataProc.py
scripts/qpalma_predict.py
tools/data_tools/filterReads.c
tools/splicesites.py

index 158ed0c..12058c0 100644 (file)
@@ -53,7 +53,9 @@ def paths_load_data_solexa(expt,genome_info,PAR):
       #Acceptors.append([-inf]*currentSize)
       #Donors.append([-inf]*currentSize)
 
-      exon_idx = int(exon_idx)-1
+      exon_idx = int(exon_idx)
+
+      assert exon_idx > 0
       #print "%d " % exon_idx,
       currentExons = zeros((2,2))
       #for idx in range(len(currentGene.exons)):
@@ -71,15 +73,19 @@ def paths_load_data_solexa(expt,genome_info,PAR):
 
          up_cut   = int(currentExons[0,0]) - cut_offset
          down_cut = int(currentExons[1,1]) + cut_offset
-
          if up_cut < 0:
             up_cut = 0
 
          if down_cut > len(currentSeq):
             down_cut = len(currentSeq)
 
+         assert down_cut - up_cut > 0
+         assert down_cut - up_cut <= len(currentSeq)
+
          currentSeq = currentSeq[up_cut:down_cut]
 
+         #pdb.set_trace()
+
          if up_cut > 0:
             currentExons[0,0] -= up_cut 
             currentExons[0,1] -= up_cut
@@ -87,10 +93,11 @@ def paths_load_data_solexa(expt,genome_info,PAR):
             currentExons[1,0] -= up_cut
             currentExons[1,1] -= up_cut
             
-
       except:
-         pdb.set_trace()
+         continue
+         #pdb.set_trace()
 
+      #pdb.set_trace()
       Sequences.append(currentSeq)
 
       currentSize = len(Sequences[-1])
index a127cb0..dc10256 100644 (file)
@@ -74,7 +74,7 @@ class QPalma:
 
          #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
          #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
-         pdb.set_trace()
+         #pdb.set_trace()
          #Qualities = []
          #for i in range(len(Ests)):
          #   Qualities.append([40]*len(Ests[i]))
@@ -96,7 +96,7 @@ class QPalma:
 
       # Initialize parameter vector  / param = numpy.matlib.rand(126,1)
       #param_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/python/elegans.param'
-      param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_16.pickle'
+      param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_82.pickle'
       param = load_param(param_filename)
 
       # Set the parameters such as limits penalties for the Plifs
@@ -121,7 +121,11 @@ class QPalma:
       currentPhi = zeros((Conf.numFeatures,1))
       totalQualityPenalties = zeros((totalQualSP,1))
 
-      for exampleIdx in range(self.numExamples):
+      total_up_off = []
+      total_down_off = []
+
+      #for exampleIdx in range(self.numExamples):
+      for exampleIdx in range(200):
 
          dna = Sequences[exampleIdx] 
          est = Ests[exampleIdx] 
@@ -288,15 +292,34 @@ class QPalma:
          AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
 
          # Check wether scalar product + loss equals viterbi score
-         print '%f vs %f' % (newDPScores, AlignmentScores[pathNr+1])
+         print '%f vs. %f' % (newDPScores, AlignmentScores[pathNr+1])
 
          # if the pathNr-best alignment is very close to the true alignment consider it as true
          if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
             true_map[pathNr+1] = 1
 
-         pdb.set_trace()
+         #pdb.set_trace()
       
-         evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
+         up_off,down_off = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
+
+         if up_off > -1:
+            total_up_off.append(up_off)
+            total_down_off.append(down_off)
+
+      total_up = 0
+      total_down = 0 
+      for idx in range(len(total_up_off)):
+         total_up    += total_up_off[idx]
+         total_down  += total_down_off[idx]
+         
+      total_up /= len(total_up_off)
+      total_down /= len(total_down_off)
+
+      print 'Mean up_off is %f' % total_up
+      print 'Mean down_off is %f' % total_down
+      #print total_up_off
+      #print total_down_off
+      print 'len is %d' % len(total_up_off)
 
       print 'Prediction completed'
       self.logfh.close()
@@ -373,7 +396,20 @@ def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
       if oldElem == 0 and elem != 0:
          newExons.append(pos-1)
 
-   pdb.set_trace()
+   #pdb.set_trace()
+   up_off   = -1
+   down_off = -1
+
+   if len(newExons) != 4:
+      acc = 0.0
+   else:
+      e1_begin,e1_end = newExons[0],newExons[1]
+      e2_begin,e2_end = newExons[2],newExons[3]
+
+      up_off   = int(math.fabs(e1_end - exons[0,1]))
+      down_off = int(math.fabs(e2_begin - exons[1,0]))
+
+   return up_off,down_off
 
 if __name__ == '__main__':
    qpalma = QPalma()
index eb14ff4..ccefb19 100644 (file)
@@ -259,14 +259,13 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          //printf("exon %d %d inton til %d pos %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
             
          if (cur_exon_start - prev_exon_stop < min_overlap || cur_exon_start < pos ) { // go to next exon
-            exon_idx++;
-
             if (ue != 0 && dov != 0)
-               combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id,exon_idx-1);
+               combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id,exon_idx);
 
             if (uo != 0 && ds != 0) 
-               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id,exon_idx-1);
+               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id,exon_idx);
 
+            exon_idx++;
             ue = uo = ds = dov = 0;
             goto exon_label;
          }
index 1112c78..7750f75 100644 (file)
@@ -10,6 +10,11 @@ from shogun.Classifier import SVM
 from shogun.Features import StringCharFeatures,DNA
 from shogun.Kernel import WeightedDegreeStringKernel
 
+import cPickle
+
+from qpalma.DataProc import *
+from qpalma.TrainingParam import Param
+
 
 class splicesites():
     """splice site scores (both donor and acceptor)"""
@@ -353,10 +358,6 @@ def dict2seqlist(dic):
 def example():
     """a test example"""
     
-    import cPickle
-    
-    from paths_load_data_solexa import paths_load_data_solexa
-    from TrainingParam import Param
 
     filenames = {'data_dir':'/fml/ag-raetsch/share/projects/palma/param_files/',\
                  'param':'arabidopsis_ath1_ss=1_il=1.param.bz2',
@@ -368,10 +369,10 @@ def example():
     ss.load_model(filenames, parameters)    
 
     ARGS = Param()
-    ginfo_filename = 'genome_info.pickle'
+    ginfo_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/qpalma/genome_info.pickle'
     genome_info = cPickle.load(open(ginfo_filename))
     
-    (Sequences, Acceptors, Donors, Exons, Ests, Qualities) \
+    (Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos) \
                 = paths_load_data_solexa('training',genome_info,ARGS)
 
     dna = dict((str(ix),Sequences[ix]) for ix in xrange(len(Sequences)))