+ fixed nasty index bugs in the when creating the label of the ground truth
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 12 Mar 2008 14:27:48 +0000 (14:27 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 12 Mar 2008 14:27:48 +0000 (14:27 +0000)
+ computeWeights is now happy -> follow the convention acceptor scores are
located at the 'g' of 'ag'

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8133 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/compile_dataset.py
scripts/qpalma_main.py

index d46e3ef..f2af910 100644 (file)
@@ -421,14 +421,21 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
 
    currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
 
-   currentAcc = currentAcc[100:-100]
+   currentAcc = currentAcc[100:-98]
+   currentAcc = currentAcc[1:]
    currentDon = currentDon[100:-100]
 
-   ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq)-1 and genomicSeq[p-2]=='a' and genomicSeq[p-1]=='g' ]
+   length = len(genomicSeq)
+   currentAcc = currentAcc[:length]
+
+   currentDon = currentDon+[-inf]*(length-len(currentDon))
+
+   ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
    gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
 
    assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
    assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
+   assert len(currentAcc) == len(currentDon)
 
    return genomicSeq, currentAcc, currentDon
 
index 0da4554..cd0b8cd 100644 (file)
@@ -28,7 +28,10 @@ from numpy.linalg import norm
 
 import QPalmaDP
 import qpalma
-from qpalma.SIQP_CPX import SIQPSolver
+
+#from qpalma.SIQP_CPX import SIQPSolver
+from qpalma.SIQP_CVXOPT import SIQPSolver
+
 from qpalma.DataProc import *
 from qpalma.computeSpliceWeights import *
 from qpalma.set_param_palma import *
@@ -267,7 +270,9 @@ class QPalma:
 
       # Initialize solver 
       self.plog('Initializing problem...\n')
+
       solver = SIQPSolver(run['numFeatures'],numExamples,run['C'],self.logfh,run)
+      #solver = None
 
       #solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
       #solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
@@ -312,24 +317,23 @@ class QPalma:
             est = unbracket_est(est)
             est = est.replace('-','')
 
+            assert len(est) == run['read_size'], pdb.set_trace()
+            est_len = len(est)
+
             original_est = OriginalEsts[exampleIdx] 
             original_est = "".join(original_est)
             original_est = original_est.lower()
 
-            est_len = len(est)
-
-            dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+            dna_flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
             dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
-
-            # check whether the whole business is really related to invalid
-            # splice scores
-
-            acc_supp = [0.0]*len(acc_supp)
-            don_supp = [0.0]*len(don_supp)
-
             dna_len = len(dna)
 
-            assert len(est) == run['read_size'], pdb.set_trace()
+            # splice score is located at g of ag
+            ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and dna[p-1]=='a' and dna[p]=='g' ]
+            assert ag_tuple_pos == [p for p,e in enumerate(acc_supp) if e != -inf and p > 1], pdb.set_trace()
+
+            gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')]
+            assert gt_tuple_pos == [p for p,e in enumerate(don_supp) if e != -inf and p > 0], pdb.set_trace()
 
             if run['mode'] == 'normal':
                quality = [40]*len(est)
@@ -340,8 +344,13 @@ class QPalma:
             if not run['enable_quality_scores']:
                quality = [40]*len(est)
 
-            exons = Exons[exampleIdx] 
-            exons -= (up_cut-1)
+            # check whether the whole business is really related to invalid
+            # splice scores
+            #acc_supp = [0.0]*len(acc_supp)
+            #don_supp = [0.0]*len(don_supp)
+
+            original_exons = Exons[exampleIdx] 
+            exons = original_exons - (up_cut-1)
             exons[0,0] -= 1
 
             fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
@@ -349,10 +358,10 @@ class QPalma:
             donor_elem = dna[exons[0,1]:exons[0,1]+2]
             acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
 
-            if not ( donor_elem == 'gt' or donor_elem == 'gc'):
+            if not ( donor_elem == 'gt' or donor_elem == 'gc' ):
                print 'invalid donor in example %d'% exampleIdx
 
-            if not ( acceptor_elem == 'ag'):
+            if not ( acceptor_elem == 'ag' ):
                print 'invalid acceptor in example %d'% exampleIdx
    
             assert len(fetched_dna_subseq) == len(est), pdb.set_trace()
@@ -379,6 +388,9 @@ class QPalma:
                   if elem != -inf:
                      acc_supp[idx] = 0.0
 
+            #pdb.set_trace()
+            #continue
+
             # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
             if run['mode'] == 'using_quality_scores':
                trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
@@ -424,7 +436,6 @@ class QPalma:
             #myalign wants the acceptor site on the g of the ag
             #acceptor = acceptor[1:]
             #acceptor.append(-inf)
-            acceptor.append(0)
 
             #donor = [-inf] + donor[:-1]
             #pdb.set_trace()
@@ -434,7 +445,6 @@ class QPalma:
             _newSpliceAlign, _newEstAlign, _newWeightMatch, _newDPScores,\
             _newQualityPlifsFeatures, unneeded1, unneeded2 =\
             self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False)
-
             mm_len = run['matchmatrixRows']*run['matchmatrixCols']
 
             # old code removed
@@ -465,7 +475,7 @@ class QPalma:
                weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
                h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
                acc_supp)
-
+              
                decodedQualityFeatures = zeros((run['totalQualSuppPoints'],1))
                decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
                # Gewichte in restliche Zeilen der Matrix speichern
@@ -549,7 +559,9 @@ class QPalma:
                   differenceVector  = trueWeight - firstFalseWeights
                   #pdb.set_trace()
 
+                  #print 'NOT ADDING ANY CONSTRAINTS'
                   const_added = solver.addConstraint(differenceVector, exampleIdx)
+
                   const_added_ctr += 1
                #
                # end of one example processing