+ minor modifications to allow for parallel prediction of large datasets
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 16 Apr 2008 22:31:23 +0000 (22:31 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 16 Apr 2008 22:31:23 +0000 (22:31 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8556 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/Evaluation.py
scripts/PipelineHeuristic.py
scripts/compile_dataset.py
scripts/qpalma_main.py

index 112e2bb..cdfbec9 100644 (file)
@@ -310,10 +310,13 @@ def perform_prediction(current_dir,run_name):
    This function takes care of starting the jobs needed for the prediction phase
    of qpalma
    """
-   cmd = 'echo /fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s | qsub -l h_vmem=12.0G -cwd -j y -N \"%s.log\"'%(current_dir,run_name)
-   #cmd = './doPrediction.sh %s 1>%s.out 2>%s.err' %(current_dir,run_name,run_name)
-   #print cmd
-   os.system(cmd)
+   for i in range(1,6):
+      cmd = 'echo /fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s %d |\
+      qsub -l h_vmem=12.0G -cwd -j y -N \"%s_%d.log\"'%(current_dir,i,run_name,i)
+
+      #cmd = './doPrediction.sh %s 1>%s.out 2>%s.err' %(current_dir,run_name,run_name)
+      #print cmd
+      os.system(cmd)
 
 
 def forall_experiments(current_func,tl_dir):
@@ -351,9 +354,9 @@ if __name__ == '__main__':
    dir = sys.argv[1]
    assert os.path.exists(dir), 'Error: Directory does not exist!'
 
-   global data
-   data_fn = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_test_new'
-   data = cPickle.load(open(data_fn))
+   #global data
+   #data_fn = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_test_new'
+   #data = cPickle.load(open(data_fn))
 
    forall_experiments(perform_prediction,dir)
    #forall_experiments(collect_prediction,dir)
index 1092c9d..ca04bd3 100644 (file)
@@ -118,7 +118,6 @@ class PipelineHeuristic:
       self.true_neg  = 0
       self.false_neg = 0
 
-
       # total time spend for get seq and scores
       self.get_time  = 0.0
       self.calcAlignmentScoreTime = 0.0
@@ -308,15 +307,15 @@ class PipelineHeuristic:
 
          _currentAcc = currentAcc[:int(exons[1,1])]
 
-         acc_mean = mean([e for e in _currentAcc if e != -inf])
-         factor = 8.5
-         _currentAcc = [acc_mean*factor]*len(_currentAcc) 
+         #acc_mean = mean([e for e in _currentAcc if e != -inf])
+         #factor = 8.5
+         _currentAcc = [0.25]*len(_currentAcc) 
 
          _currentDon = currentDon[:int(exons[1,1])]
 
-         don_mean = mean([e for e in _currentAcc if e != -inf])
-         factor = 2.5
-         _currentDon = [don_mean*factor]*len(_currentDon)
+         #don_mean = mean([e for e in _currentAcc if e != -inf])
+         #factor = 2.5
+         _currentDon = [0.25]*len(_currentDon)
 
          #pdb.set_trace()
 
@@ -381,8 +380,9 @@ if __name__ == '__main__':
 
    run_fname   = jp(dir,'run_object.pickle')
    #data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_flag'
-   data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_2k'
-   #data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_100'
+
+   #data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_2k'
+   data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_100'
 
    param_fname = jp(dir,'param_500.pickle')
 
@@ -394,7 +394,8 @@ if __name__ == '__main__':
 
    print 'total time elapsed: %f' % (stop-start)
    print 'time spend for get seq: %f' % ph1.get_time
-   print 'time spend for calcAlignmentScore: %f' %  ph1.calcAlignmentScoreTime
+   print 'time spend for calcAlignmentScoreTime: %f' %  ph1.calcAlignmentScoreTime
+
    #import cProfile
    #cProfile.run('ph1.filter()')
 
index 682b41f..9953faa 100644 (file)
@@ -172,7 +172,7 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
          if instance_counter % 1000 == 0:
             print 'processed %d examples' % instance_counter
 
-         if instance_counter == 100000:
+         if instance_counter == 200000:
             break
 
    print 'Full dataset has size %d' % len(SeqInfo)
index 1b6449c..b7fbf7f 100644 (file)
@@ -667,7 +667,6 @@ class QPalma:
 
       #AlternativeSequences = paths_load_data(data_filename,'training',None,self.ARGS)
 
-
       self.SeqInfo     = SeqInfo
       self.Exons       = Exons
       self.OriginalEsts= OriginalEsts
@@ -677,11 +676,11 @@ class QPalma:
       #calc_info(self.Acceptors,self.Donors,self.Exons,self.Qualities)
       #print 'leaving constructor...'
 
-      self.logfh = open('_qpalma_predict.log','w+')
+      self.logfh = open('_qpalma_predict_%d.log'%run['id'],'w+')
 
       # predict on training set
-      self.plog('##### Prediction on the training set #####\n')
-      self.predict(param_filename,0,beg,'TRAIN')
+      #self.plog('##### Prediction on the training set #####\n')
+      #self.predict(param_filename,0,beg,'TRAIN')
       
       # predict on test set
       self.plog('##### Prediction on the test set #####\n')
@@ -799,6 +798,8 @@ class QPalma:
          current_prediction['start_pos']  = up_cut
          current_prediction['label'] = True
          current_prediction['true_cut'] = true_cut
+         current_prediction['chr'] = chr
+         current_prediction['strand'] = strand
 
          current_example_predictions.append(current_prediction)
 
@@ -829,6 +830,8 @@ class QPalma:
             current_prediction['alternative_start_pos'] = genomicSeq_start
             current_prediction['label'] = currentLabel
             current_prediction['true_cut'] = true_cut
+            current_prediction['chr'] = chr
+            current_prediction['strand'] = strand
 
             current_example_predictions.append(current_prediction)