+ training works,
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 14 Mar 2008 17:29:26 +0000 (17:29 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 14 Mar 2008 17:29:26 +0000 (17:29 +0000)
+ prediction however has still some issues with the correct dna sequence

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8160 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/Evaluation.py
scripts/check_dataset.py
scripts/compile_dataset.py
scripts/qpalma_main.py
scripts/resurrect

index 706d052..b02ee9c 100644 (file)
@@ -234,8 +234,8 @@ if __name__ == '__main__':
    dir = sys.argv[1]
    assert os.path.exists(dir), 'Error directory does not exist!'
 
-   #forall_experiments(perform_prediction,dir)
-   forall_experiments(collect_prediction,dir)
+   forall_experiments(perform_prediction,dir)
+   #forall_experiments(collect_prediction,dir)
 
 """
          if elem_nr > 0:
index 2275270..bb81a62 100644 (file)
@@ -19,6 +19,9 @@ def checkAll(filename):
       currentInfo = SeqInfo[idx]
       chr,strand,p1,p2 = currentInfo
       currentExon = Exons[idx]
+      currentExon[0,1] += 1
+      currentExon -= 1
+
       currentEst  = OriginalEsts[idx]
       originalEst  = OriginalEsts[idx]
 
index f2af910..8ac6fa6 100644 (file)
@@ -161,7 +161,7 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
          if instance_counter % 1000 == 0:
             print 'processed %d examples' % instance_counter
 
-         if instance_counter == 20000:
+         if instance_counter == 60000:
             break
 
    print 'Full dataset has size %d' % len(SeqInfo)
index cd0b8cd..4180006 100644 (file)
@@ -48,7 +48,6 @@ from qpalma.Configuration import *
 # functions
 from Genefinding import *
 from genome_utils import load_genomic
-
 from Utils import calc_stat, calc_info, pprint_alignment
 
 
@@ -103,6 +102,8 @@ class QPalma:
       dna_len = len(dna)
       est_len = len(est)
 
+      #pdb.set_trace()
+
       prb = QPalmaDP.createDoubleArrayFromList(quality)
       chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
 
@@ -349,12 +350,13 @@ class QPalma:
             #acc_supp = [0.0]*len(acc_supp)
             #don_supp = [0.0]*len(don_supp)
 
-            original_exons = Exons[exampleIdx] 
+            original_exons = Exons[exampleIdx]
             exons = original_exons - (up_cut-1)
             exons[0,0] -= 1
+            exons[1,0] -= 1
 
             fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
-
+            
             donor_elem = dna[exons[0,1]:exons[0,1]+2]
             acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
 
@@ -401,7 +403,7 @@ class QPalma:
 
             dna_calc = dna_calc.replace('-','')
 
-            print 'right before computeSpliceWeights exampleIdx %d' % exampleIdx
+            #print 'right before computeSpliceWeights exampleIdx %d' % exampleIdx
             # Calculate the weights
             trueWeightDon, trueWeightAcc, trueWeightIntron =\
             computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
@@ -471,7 +473,7 @@ class QPalma:
 
             for pathNr in range(num_path[exampleIdx]):
                #print 'decodedWeights' 
-               print 'right before computeSpliceWeights(2) exampleIdx %d' % exampleIdx
+               #print 'right before computeSpliceWeights(2) exampleIdx %d' % exampleIdx
                weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
                h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
                acc_supp)
@@ -636,20 +638,14 @@ class QPalma:
       end = run['prediction_end']
 
       data_filename = self.run['dataset_filename']
-      Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities,\
-      UpCut, StartPos, AlternativeSequences=\
-      paths_load_data(data_filename,'training',None,self.ARGS)
 
-      self.Sequences   = Sequences
+      SeqInfo, Exons, OriginalEsts, Qualities,\
+      AlternativeSequences = paths_load_data(data_filename,'training',None,self.ARGS)
+
+      self.SeqInfo     = SeqInfo
       self.Exons       = Exons
-      self.Ests        = Ests
       self.OriginalEsts= OriginalEsts
       self.Qualities   = Qualities
-      self.Donors      = Donors
-      self.Acceptors   = Acceptors
-      self.UpCut       = UpCut
-      self.StartPos    = StartPos
-
       self.AlternativeSequences = AlternativeSequences
 
       #calc_info(self.Acceptors,self.Donors,self.Exons,self.Qualities)
@@ -685,23 +681,18 @@ class QPalma:
       else:
          assert(False)
 
-      Sequences   = self.Sequences[beg:end]
+      SeqInfo     = self.SeqInfo[beg:end]
       Exons       = self.Exons[beg:end]
-      Ests        = self.Ests[beg:end]
-      OriginalEsts        = self.OriginalEsts[beg:end]
+      OriginalEsts= self.OriginalEsts[beg:end]
       Qualities   = self.Qualities[beg:end]
-      Acceptors   = self.Acceptors[beg:end]
-      Donors      = self.Donors[beg:end]
-      UpCut       = self.UpCut[beg:end]
-      StartPos    = self.StartPos[beg:end]
-
       AlternativeSequences = self.AlternativeSequences[beg:end]
 
       # number of training instances
-      N = numExamples = len(Sequences) 
-      assert len(Exons) == N and len(Ests) == N\
-      and len(Qualities) == N and len(Acceptors) == N\
-      and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
+      N = numExamples = len(SeqInfo) 
+      assert len(Exons) == N and len(OriginalEsts) == N\
+      and len(Qualities) == N and len(AlternativeSequences) == N,\
+      'The Exons,Acc,Don,.. arrays are of different lengths'
+
       self.plog('Number of training examples: %d\n'% numExamples)
 
       self.noImprovementCtr = 0
@@ -738,40 +729,80 @@ class QPalma:
       for exampleIdx in range(numExamples):
          self.plog('Loading example nr. %d...\n'%exampleIdx)
 
-         dna = Sequences[exampleIdx] 
-         est = Ests[exampleIdx] 
-
-         new_est = unbracket_est(est)
+         currentSeqInfo = SeqInfo[exampleIdx]
+         chr,strand,up_cut,down_cut = currentSeqInfo 
 
+         est = OriginalEsts[exampleIdx] 
+         est = "".join(est)
+         est = est.lower()
+         est = unbracket_est(est)
+         est = est.replace('-','')
 
-         exons = Exons[exampleIdx]
+         assert len(est) == run['read_size'], pdb.set_trace()
+         est_len = len(est)
 
-         current_up_cut = UpCut[exampleIdx]
+         original_est = OriginalEsts[exampleIdx] 
+         original_est = "".join(original_est)
+         original_est = original_est.lower()
 
-         current_start_pos = StartPos[exampleIdx]
+         dna_flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+         dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
+         dna_len = len(dna)
 
-         currentAlternatives = AlternativeSequences[exampleIdx]
+         # splice score is located at g of ag
+         ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and dna[p-1]=='a' and dna[p]=='g' ]
+         assert ag_tuple_pos == [p for p,e in enumerate(acc_supp) if e != -inf and p > 1], pdb.set_trace()
 
-         #est = est.replace('-','')
-         #original_est = OriginalEsts[exampleIdx] 
-         #original_est = "".join(original_est)
-         #original_est = original_est.lower()
-         #currentSplitPos = SplitPos[exampleIdx]
+         gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')]
+         assert gt_tuple_pos == [p for p,e in enumerate(don_supp) if e != -inf and p > 0], pdb.set_trace()
 
-         if self.run['mode'] == 'normal':
+         if run['mode'] == 'normal':
             quality = [40]*len(est)
 
-         if self.run['mode'] == 'using_quality_scores':
-            quality = Qualities[exampleIdx]
+         if run['mode'] == 'using_quality_scores':
+            quality = Qualities[exampleIdx][0]
 
          if not run['enable_quality_scores']:
             quality = [40]*len(est)
 
-         don_supp = Donors[exampleIdx] 
-         acc_supp = Acceptors[exampleIdx] 
+         # check whether the whole business is really related to invalid
+         # splice scores
+         #acc_supp = [0.0]*len(acc_supp)
+         #don_supp = [0.0]*len(don_supp)
 
-         if not run['enable_splice_signals']:
+         original_exons = Exons[exampleIdx]
+         exons = original_exons - (up_cut-1)
+         exons[0,0] -= 1
+         exons[1,0] -= 1
+
+         fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
+         
+         donor_elem = dna[exons[0,1]:exons[0,1]+2]
+         acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
+
+         if not ( donor_elem == 'gt' or donor_elem == 'gc' ):
+            print 'invalid donor in example %d'% exampleIdx
+
+         if not ( acceptor_elem == 'ag' ):
+            print 'invalid acceptor in example %d'% exampleIdx
+
+         assert len(fetched_dna_subseq) == len(est), pdb.set_trace()
+
+         new_string = ''
+         for idx in range(len(est)):
+            dna_char = fetched_dna_subseq[idx] 
+            est_char = est[idx]
+            if dna_char == est_char:
+               new_string += est_char
+            else:
+               new_string += '[%s%s]'%(dna_char,est_char)
 
+         if new_string != original_est:
+            print new_string,original_est
+            print exampleIdx
+            continue
+
+         if not run['enable_splice_signals']:
             for idx,elem in enumerate(don_supp):
                if elem != -inf:
                   don_supp[idx] = 0.0
@@ -780,12 +811,13 @@ class QPalma:
                if elem != -inf:
                   acc_supp[idx] = 0.0
 
+         currentAlternatives = AlternativeSequences[exampleIdx]
          current_example_predictions = []
 
          # first make a prediction on the dna fragment which comes from the ground truth                  
          current_prediction = self.calc_alignment(dna, est, exons, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs)
          current_prediction['exampleIdx'] = exampleIdx
-         current_prediction['start_pos']  = current_start_pos
+         current_prediction['start_pos']  = up_cut
          current_prediction['label'] = True
 
          current_example_predictions.append(current_prediction)
@@ -808,7 +840,7 @@ class QPalma:
             current_prediction = self.calc_alignment(currentDNASeq, est, exons,\
             quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
             current_prediction['exampleIdx'] = exampleIdx
-            current_prediction['start_pos'] = current_start_pos
+            current_prediction['start_pos'] = up_cut
             current_prediction['alternative_start_pos'] = genomicSeq_start
             current_prediction['label'] = currentLabel
 
@@ -827,13 +859,8 @@ class QPalma:
       """
 
       run = self.run
-
       donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
 
-      #myalign wants the acceptor site on the g of the ag
-      acceptor = acceptor[1:]
-      acceptor.append(-inf)
-
       dna = str(dna)
       est = str(est)
       dna_len = len(dna)
@@ -841,22 +868,13 @@ class QPalma:
 
       ps = h.convert2SWIG()
 
-      __newSpliceAlign, __newEstAlign, __newWeightMatch, __newDPScores,\
-      __newQualityPlifsFeatures, __dna_array, __est_array =\
+      newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
+      newQualityPlifsFeatures, dna_array, est_array =\
       self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
 
       mm_len = run['matchmatrixRows']*run['matchmatrixCols']
 
       # old code removed
-
-      newSpliceAlign = __newSpliceAlign
-      newEstAlign    = __newEstAlign
-      newWeightMatch = __newWeightMatch
-      newDPScores    = __newDPScores
-      newQualityPlifsFeatures = __newQualityPlifsFeatures
-      dna_array = __dna_array
-      est_array = __est_array
-
       newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
       newWeightMatch = newWeightMatch.reshape(1,mm_len)
       true_map = [0]*2
index cf8f27a..3e7e980 100755 (executable)
@@ -3,7 +3,6 @@
 instance=$1
 
 source $HOME/.bashrc
-export ILOG_LICENSE_FILE=/fml/ag-raetsch/share/software/ilog/ilm/access2.ilm
 
 #python -c "import sys; from cPickle import *; ma=load(open('lmm_${instance}.pickle'))
 #