+ fixed automatic Python path settings in Makefiles
[qpalma.git] / qpalma / qpalma_main.py
index f4b8c89..b856505 100644 (file)
@@ -17,7 +17,7 @@ from numpy.matlib import mat,zeros,ones,inf
 from numpy.linalg import norm
 
 #from qpalma.SIQP_CPX import SIQPSolver
-#from qpalma.SIQP_CVXOPT import SIQPSolver
+from qpalma.SIQP_CVXOPT import SIQPSolver
 
 import QPalmaDP
 import qpalma
@@ -33,39 +33,29 @@ from qpalma.utils import pprint_alignment, get_alignment
 jp = os.path.join
 dotProd = lambda x,y: (x.T * y)[0,0]
 
+
 class SpliceSiteException:
    pass
 
 
-def getData(training_set,exampleKey,settings):
-   """ This function...  """
+def preprocessExample(training_set,exampleKey,seqInfo,settings):
+   """ 
+   This function...  
+   """
 
-   currentSeqInfo,original_est,currentQualities,currentExons = training_set[exampleKey]
+   currentSeqInfo,originalRead,currentQualities,currentExons = training_set[exampleKey]
    id,chr,strand,up_cut,down_cut = currentSeqInfo
 
-   est = original_est
-   est = "".join(est)
-   est = est.lower()
-   est = unbracket_est(est)
-   est = est.replace('-','')
-
-   est_len = len(est)
+   read = unbracket_seq(originalRead)
+   read = read.replace('-','')
 
-   original_est = "".join(original_est)
-   original_est = original_est.lower()
+   dna, acc_supp, don_supp = seqInfo.get_seq_and_scores(chr,strand,up_cut,down_cut)
 
-   dna_flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-   dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
-
-   #currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
-
-   original_exons = currentExons
-   exons = original_exons - (up_cut-1)
-   exons[0,0] -= 1
-   exons[1,0] -= 1
+   exons = currentExons - up_cut
 
    if exons.shape == (2,2):
       fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
+      print fetched_dna_subseq
      
       donor_elem = dna[exons[0,1]:exons[0,1]+2]
       acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
@@ -78,9 +68,53 @@ def getData(training_set,exampleKey,settings):
          print 'invalid acceptor in example %d'% exampleKey
          raise SpliceSiteException
 
-      assert len(fetched_dna_subseq) == len(est), pdb.set_trace()
+      assert len(fetched_dna_subseq) == len(read), pdb.set_trace()
+
+   return dna,read,acc_supp,don_supp,exons,originalRead,currentQualities
+
+
+def performAlignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode,settings):
+   """
+   Given the needed input this method calls the QPalma C module which
+   calculates a dynamic programming in order to obtain an alignment
+   """
+
+   prb = QPalmaDP.createDoubleArrayFromList(quality)
+   chastity = QPalmaDP.createDoubleArrayFromList([.0]*len(read))
+
+   matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
+   mm_len = settings['matchmatrixRows']*settings['matchmatrixCols']
+
+   don_len = len(donor)
+   donor = QPalmaDP.createDoubleArrayFromList(donor)
+   acc_len = len(acceptor)
+   acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
+
+   # Create the alignment object representing the interface to the C/C++ code.
+   currentAlignment = QPalmaDP.Alignment(settings['numQualPlifs'],settings['numQualSuppPoints'], settings['enable_quality_scores'])
+
+   c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
+
+   # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
+   currentAlignment.myalign( current_num_path, dna, len(dna),\
+    read, len(read), prb, chastity, ps, matchmatrix, mm_len, donor, don_len,\
+    acceptor, acc_len, c_qualityPlifs, settings['remove_duplicate_scores'],\
+    settings['print_matrix'] )
+
+   if prediction_mode:
+      # part that is only needed for prediction
+      result_len = currentAlignment.getResultLength()
+      dna_array,read_array = currentAlignment.getAlignmentArraysNew()
+   else:
+      dna_array = None
+      read_array = None
+
+   newSpliceAlign, newEstAlign, newWeightMatch, newDPScores, newQualityPlifsFeatures =\
+   currentAlignment.getAlignmentResultsNew()
+
+   return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
+   newQualityPlifsFeatures, dna_array, read_array
 
-   return dna,est,acc_supp,don_supp,exons,original_est,currentQualities
 
 
 class QPalma:
@@ -93,59 +127,19 @@ class QPalma:
       self.ARGS = Param()
       self.qpalma_debug_mode = dmode
       self.seqInfo = seqInfo
+      self.logfh = None
 
 
    def plog(self,string):
-      self.logfh.write(string)
-      self.logfh.flush()
-
-
-   def do_alignment(self,dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode,settings):
-      """
-      Given the needed input this method calls the QPalma C module which
-      calculates a dynamic programming in order to obtain an alignment
-      """
-
-      dna_len = len(dna)
-      est_len = len(est)
-
-      prb = QPalmaDP.createDoubleArrayFromList(quality)
-      chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
-
-      matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
-      mm_len = settings['matchmatrixRows']*settings['matchmatrixCols']
-
-      d_len = len(donor)
-      donor = QPalmaDP.createDoubleArrayFromList(donor)
-      a_len = len(acceptor)
-      acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
-
-      # Create the alignment object representing the interface to the C/C++ code.
-      currentAlignment = QPalmaDP.Alignment(settings['numQualPlifs'],settings['numQualSuppPoints'], settings['enable_quality_scores'])
-      c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
-      # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
-      currentAlignment.myalign( current_num_path, dna, dna_len,\
-       est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
-       acceptor, a_len, c_qualityPlifs, settings['remove_duplicate_scores'],\
-       settings['print_matrix'] )
-
-      if prediction_mode:
-         # part that is only needed for prediction
-         result_len = currentAlignment.getResultLength()
-         dna_array,est_array = currentAlignment.getAlignmentArraysNew()
+      if self.logfh != None:
+         self.logfh.write(string)
+         self.logfh.flush()
       else:
-         dna_array = None
-         est_array = None
-
-      newSpliceAlign, newEstAlign, newWeightMatch, newDPScores, newQualityPlifsFeatures =\
-      currentAlignment.getAlignmentResultsNew()
-
-      return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
-      newQualityPlifsFeatures, dna_array, est_array
+         print string
 
 
-   def init_train(self,settings,run_name):
-      full_working_path = jp(settings['alignment_dir'],run_name)
+   def init_training(self,dataset_fn,training_keys,settings,set_name):
+      full_working_path = jp(settings['training_dir'],run_name)
 
       #assert not os.path.exists(full_working_path)
       if not os.path.exists(full_working_path):
@@ -158,10 +152,12 @@ class QPalma:
 
       self.logfh = open('_qpalma_train.log','w+')
 
-      train(settings)
+      training_set = cPickle.load(open(dataset_fn))
 
+      self.train(training_set,settings,set_name)
 
-   def train(self,settings,training_set):
+
+   def train(self,training_set,settings,set_name):
       """
       The mainloop for training.
       """
@@ -183,11 +179,14 @@ class QPalma:
       totalQualSP = settings['totalQualSuppPoints']
 
       # calculate the total number of features
-      numFeatures = lengthSP+donSP+accSP+mmatrixSP*numq
+      numFeatures = settings['numFeatures']
 
       # Initialize parameter vector
       param = numpy.matlib.rand(numFeatures,1)
 
+      # we take the first quality vector of the tuple of quality vectors
+      quality_index = 0
+
       # no intron length model
       if not settings['enable_intron_length']:
          param[:lengthSP] *= 0.0
@@ -202,13 +201,14 @@ class QPalma:
          solver = SIQPSolver(numFeatures,numExamples,settings['C'],self.logfh,settings)
       except:
          self.plog('Got no license. Telling queue to reschedule job...\n')
+         self.plog(sys.exc_info())
          sys.exit(99)
 
-      solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
-      solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
+      #solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
+      #solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
 
       # stores the number of alignments done for each example (best path, second-best path etc.)
-      num_path = settings['anzpath']*numExamples
+      num_path = [settings['anzpath']]*numExamples
 
       currentPhi = zeros((numFeatures,1))
       totalQualityPenalties = zeros((totalQualSP,1))
@@ -229,12 +229,13 @@ class QPalma:
          if iteration_nr == settings['iter_steps']:
             break
 
+         #print 'Got %d examples' % len(training_set.keys())
          for exampleIdx,example_key in enumerate(training_set.keys()):
             print 'Current example %d' % example_key
 
             try:
-               dna,est,acc_supp,don_supp,exons,original_est,currentQualities =\
-               getData(training_set,example_key,settings)
+               dna,read,acc_supp,don_supp,exons,original_read,currentQualities =\
+               preprocessExample(training_set,example_key,self.seqInfo,settings)
             except SpliceSiteException:
                continue
 
@@ -252,17 +253,17 @@ class QPalma:
                   if elem != -inf:
                      acc_supp[idx] = 0.0
 
-
             # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
             if settings['enable_quality_scores']:
                trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
-               computeSpliceAlignWithQuality(dna, exons, est, original_est,\
+               computeSpliceAlignWithQuality(dna, exons, read, original_read,\
                quality, qualityPlifs,settings)
             else:
                trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
 
             dna_calc = dna_calc.replace('-','')
 
+
             #print 'right before computeSpliceWeights exampleIdx %d' % exampleIdx
             # Calculate the weights
             trueWeightDon, trueWeightAcc, trueWeightIntron =\
@@ -297,15 +298,16 @@ class QPalma:
 
             ps = h.convert2SWIG()
 
-            newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
-            newQualityPlifsFeatures, unneeded1, unneeded2 =\
-            self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False,settings)
+            newSpliceAlign, newEstAlign, newWeightMatch, newDPScores, newQualityPlifsFeatures, unused1, unused2 =\
+            performAlignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False,settings)
+
             mm_len = settings['matchmatrixRows']*settings['matchmatrixCols']
 
-            newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],len(dna))
-            newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
+            # check for correct reshaping !!
+            newSpliceAlign = numpy.mat(newSpliceAlign).reshape(num_path[exampleIdx],len(dna))
+            newWeightMatch = numpy.mat(newWeightMatch).reshape(num_path[exampleIdx],mm_len)
 
-            newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],settings['totalQualSuppPoints'])
+            newQualityPlifsFeatures = numpy.mat(newQualityPlifsFeatures).reshape(num_path[exampleIdx],settings['totalQualSuppPoints'])
             # Calculate weights of the respective alignments. Note that we are calculating n-best alignments without 
             # hamming loss, so we have to keep track which of the n-best alignments correspond to the true one in order 
             # not to incorporate a true alignment in the
@@ -339,12 +341,12 @@ class QPalma:
                   distinct_scores = True
 
                # Check wether scalar product + loss equals viterbi score
-               if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
+               if not math.fabs(newDPScores[pathNr] - AlignmentScores[pathNr+1]) <= 1e-5:
                   self.plog("Scalar prod. + loss not equals Viterbi output!\n")
                   pdb.set_trace()
 
                self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
-               self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
+               self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr],AlignmentScores[pathNr+1]))
 
                # if the pathNr-best alignment is very close to the true alignment consider it as true
                if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
@@ -374,11 +376,7 @@ class QPalma:
 
                if False:
                   self.plog("Is considered as: %d\n" % true_map[1])
-
-                  #result_len = currentAlignment.getResultLength()
-
                   dna_array,est_array = currentAlignment.getAlignmentArraysNew()
-
                   _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
                   _newEstAlign = newEstAlign[0].flatten().tolist()[0]
 
@@ -394,8 +392,9 @@ class QPalma:
 
                # end of one example processing 
 
-            # call solver every nth example //added constraint
-            if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
+            # call solver every nth example / added constraint
+            #if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
+            if True:
                objValue,w,self.slacks = solver.solve()
                solver_call_ctr += 1
 
@@ -427,27 +426,25 @@ class QPalma:
                [h,d,a,mmatrix,qualityPlifs] =\
                set_param_palma(param,self.ARGS.train_with_intronlengthinformation,settings)
 
-         ##############################################
-         # end of one iteration through all examples  #
-         ##############################################
-
+         # end of one iteration through all examples
          self.plog("suboptimal rounds %d\n" %suboptimal_example)
 
          if self.noImprovementCtr == numExamples*2:
-            FinalizeTraining(param,'param_%d.pickle'%param_idx)
+            self.FinalizeTraining(param,'param_%d.pickle'%param_idx)
 
          iteration_nr += 1
 
       #
       # end of optimization 
       #  
-      FinalizeTraining(param,'param_%d.pickle'%param_idx)
+      self.FinalizeTraining(param,'param_%d.pickle'%param_idx)
 
 
-   def FinalizeTraining(self,vector,name):
+   def FinalizeTraining(self,param,name):
       self.plog("Training completed")
       cPickle.dump(param,open(name,'w+'))
-      self.logfh.close()
+      if self.logfh != None:
+         self.logfh.close()
    
 
 ###############################################################################
@@ -493,17 +490,17 @@ class QPalma:
       # we do not need the full dataset anymore
       del dataset
    
-      # Load parameter vector to predict with
-      param = cPickle.load(open(settings['prediction_param_fn']))
-
-      self.predict(prediction_set,param,settings)
+      self.predict(prediction_set,settings)
 
 
-   def predict(self,prediction_set,param,settings):
+   def predict(self,prediction_set,settings):
       """
       This method...
       """
 
+      # Load parameter vector to predict with
+      param = cPickle.load(open(settings['prediction_param_fn']))
+
       # Set the parameters such as limits/penalties for the Plifs
       [h,d,a,mmatrix,qualityPlifs] =\
       set_param_palma(param,self.ARGS.train_with_intronlengthinformation,settings)
@@ -541,6 +538,7 @@ class QPalma:
                currentDNASeq, currentAcc, currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
             except:
                self.problem_ctr += 1
+               print sys.exc_info()
                continue
 
             if not settings['enable_splice_scores']:
@@ -594,7 +592,7 @@ class QPalma:
 
       newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
       newQualityPlifsFeatures, dna_array, read_array =\
-      self.do_alignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True,settings)
+      performAlignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True,settings)
 
       mm_len = settings['matchmatrixRows']*settings['matchmatrixCols']