+ minor changes
author     fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
           Mon, 26 May 2008 07:13:53 +0000 (07:13 +0000)
committer  fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
           Mon, 26 May 2008 07:13:53 +0000 (07:13 +0000)
+ changed the data representation a bit: a read's unique id now maps to the list of all its vmatch matches
+ moved some code to other modules (e.g. load_chunks to Evaluation, read parsing to qparser)
+ removed or commented out some debugging code lines

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9188 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/Experiment.py
scripts/ModelSelection.py
scripts/QPalmaPipeline.py
scripts/SOAPParser.py
scripts/Utils.py
scripts/compile_dataset.py
scripts/createAlignmentFileFromPrediction.py
scripts/createNewDataset.py
scripts/grid_predict.py
scripts/qpalma_main.py

diff --git a/scripts/Experiment.py b/scripts/Experiment.py
index dd22238..c414d5a 100644
@@ -15,6 +15,7 @@ from Run import *
 import pdb
 import os
 import os.path
+import cPickle
 
 def get_dataset(dataset_size,num_nodes):
    all_instances = []
@@ -52,6 +53,19 @@ def get_dataset_splitting_instances(dataset_size,num_nodes):
    return all_instances
 
 
+def get_training_instances():
+   all_instances = []
+
+   params = [\
+   ('enable_quality_scores',True),\
+   ('enable_splice_signals',True),\
+   ('enable_intron_length',True)]
+
+   all_instances.append(params)
+
+   return all_instances
+
+
 def get_scoring_instances():
    all_instances = []
 
@@ -68,45 +82,31 @@ def get_scoring_instances():
 
 
 def createRuns():
-   # specify n for n-fold cross validation
-   numFolds=5
+   # load the configuration object
+   Config = cPickle.load(open(Conf.conf_object_path))
 
    # the main directory where all results are stored
-   experiment_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_single_run'
-
-   assert os.path.exists(experiment_dir), 'toplevel dir for experiment does not exist!'
-
+   alignment_dir   = Config['alignment_dir']
+   
    # list of regularization parameters and additional flags for different runs
    # for example:
    #  - with quality scores
    #  - without quality scores
    #
    bool2str = ['-','+']
-
-   allRuns = []
-
-   dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_full'
-
-   dataset_size = 500000
-   num_nodes = 10
-
    ctr = 1
 
-   #all_instances = get_scoring_instances()
-   all_instances = get_dataset_splitting_instances(dataset_size,num_nodes)
-   #all_instances = get_dataset(dataset_size,num_nodes)
-   #pdb.set_trace()
+   all_instances = get_training_instances()
             
+   allRuns = []
    for parameters in all_instances:
       # create a new Run object
       currentRun = Run()
-      currentName = 'run_'
+      currentName = 'run'
 
       for param_name,param in parameters:
          currentRun[param_name] = param
-         currentName += '%s_%s_' % (str(param_name),str(param))
-         #print param_name,param
-         #print currentName
+         currentName += '_%s_%s' % (str(param_name),str(bool2str[param]))
             
       # global settings for all runs
       currentRun['anzpath']            = Conf.anzpath
@@ -127,7 +127,7 @@ def createRuns():
       currentRun['numQualSuppPoints']     = 10
       currentRun['totalQualSuppPoints']   = currentRun['numQualPlifs']*currentRun['numQualSuppPoints']
 
-      currentRun['enable_quality_scores'] = False
+      currentRun['enable_quality_scores'] = True
       currentRun['enable_splice_signals'] = True
       currentRun['enable_intron_length']  = True
 
@@ -143,48 +143,36 @@ def createRuns():
       + currentRun['totalQualSuppPoints'] 
 
       # run-specific settings
-      currentRun['training_begin']        = Conf.training_begin
-      currentRun['training_end']          = Conf.training_end
-      #currentRun['prediction_begin']      = Conf.prediction_begin
-      #currentRun['prediction_end']        = Conf.prediction_end
-
       currentRun['C']                     = 100
 
       currentRun['name']                  = currentName
-      currentRun['dataset_filename']      = dataset_filename
-      currentRun['experiment_path']       = experiment_dir
+      currentRun['alignment_dir']         = alignment_dir
 
-      currentRun['min_intron_len'] = 20
-      currentRun['max_intron_len'] = 2000
+      currentRun['min_intron_len']        = 20
+      currentRun['max_intron_len']        = 2000
 
-      currentRun['min_svm_score'] = 0.0 
-      currentRun['max_svm_score'] = 1.0
+      currentRun['min_svm_score']         = 0.0 
+      currentRun['max_svm_score']         = 1.0
 
-      currentRun['min_qual'] = -5
-      currentRun['max_qual'] = 40
+      currentRun['min_qual']              = -5
+      currentRun['max_qual']              = 40
 
-      currentRun['dna_flat_files']      = Conf.dna_flat_fn
+      currentRun['dna_flat_files']        = Conf.dna_flat_fn
 
       currentRun['id']      = ctr
       ctr += 1
 
       allRuns.append(currentRun)
 
-
 ###############################################################################
 #
 # check for valid paths / options etc
 #
 ###############################################################################
 
-
    for currentRun in allRuns:
 
       assert 0 < currentRun['anzpath'] < 100
-      assert 0 <= currentRun['training_begin'] < currentRun['training_end']
-      assert currentRun['training_begin'] < currentRun['training_end'] 
-      assert currentRun['prediction_begin'] < currentRun['prediction_end']
-
       assert currentRun['iter_steps']
 
       #assert currentRun['matchmatrixCols']
@@ -216,8 +204,7 @@ def createRuns():
       assert currentRun['enable_intron_length']  in [True,False]
 
       #assert currentRun['totalQualSuppPoints']
-      assert os.path.exists(currentRun['dataset_filename'])
-      assert os.path.exists(currentRun['experiment_path'])
+      assert os.path.exists(currentRun['alignment_dir'])
 
    return allRuns
 
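In the hunk above, each boolean flag is now encoded into the run name via bool2str, so names carry '+'/'-' markers instead of 'True'/'False'. A minimal sketch of the resulting naming scheme, using the parameter list from get_training_instances:

    bool2str = ['-','+']
    params = [('enable_quality_scores',True),
              ('enable_splice_signals',True),
              ('enable_intron_length',True)]

    name = 'run'
    for param_name,param in params:
       # bool2str[param] maps False -> '-' and True -> '+' (bools index as 0/1)
       name += '_%s_%s' % (param_name, bool2str[param])

    print name
    # run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+
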
diff --git a/scripts/ModelSelection.py b/scripts/ModelSelection.py
index 81941f4..ebc2baf 100644
@@ -22,9 +22,9 @@ class Model:
 
       allRuns = Exp.createRuns()
       
-      pdb.set_trace()
+      #pdb.set_trace()
       
-      for currentRun in allRuns[9:]:
+      for currentRun in allRuns:
                
          currentInstance = QPalma(currentRun)
 
@@ -66,5 +66,6 @@ class Model:
 if __name__ == '__main__':
    m = Model()
    m.createInstances()
-   #m.doSelection()
-   m.doPrediction()
+   m.doSelection()
+   #m.doPrediction()
+
diff --git a/scripts/QPalmaPipeline.py b/scripts/QPalmaPipeline.py
index 46bb66b..58758b4 100644
@@ -16,7 +16,7 @@ import glob
 import time
 
 
-def get_most_recent_param(current_dir):                                                                                                                                                                                        
+def get_most_recent_param(current_dir):
    date_file_list = []
    for file in glob.glob(current_dir + '/param*'):
       stats = os.stat(file)
@@ -77,9 +77,9 @@ class FirstStep(PipelineStep):
       # ATTENTION: Changing working directory
       os.chdir(Config['mapping_spliced_dir'])
 
-      cmd = 'filter_quality.pl 8 reads_0.fl && mv reads_0.fl.filtered reads_0.fl'
-
-      cL.append(cmd)
+      # ATTENTION: Whether we should use the filtering step is not clear
+      #cmd = 'filter_quality.pl 8 reads_0.fl && mv reads_0.fl.filtered reads_0.fl'
+      #cL.append(cmd)
 
       cmd = 'map_spliced_transcript.pl --mismatches=%d --sub_mismatches=%d --read_length=%d --min_short_end=%d --repeat_mapping=%d --seedlength=%d --dir=. --suffixtree=%s' % (\
       Config['mismatches_2'],\
@@ -134,5 +134,6 @@ class FourthStep(PipelineStep):
    
    """
 
+
 if __name__ == '__main__':
    step1 = FirstStep()
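
get_most_recent_param above is the usual glob-plus-mtime idiom for picking the newest parameter file. A self-contained sketch of that idiom (simplified; the original's exact bookkeeping and return value are not shown in this hunk):

    import glob
    import os

    def newest_param_file(current_dir):
       # collect (modification time, filename) pairs for all param* files
       date_file_list = []
       for fname in glob.glob(current_dir + '/param*'):
          date_file_list.append((os.stat(fname).st_mtime, fname))
       if not date_file_list:
          return None
       # tuples compare by mtime first, so max() yields the newest file
       return max(date_file_list)[1]
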
diff --git a/scripts/SOAPParser.py b/scripts/SOAPParser.py
index 8059b65..edbffbd 100644
@@ -6,7 +6,6 @@ import os.path
 import pdb
 
 """
-
 This explanation was taken from the SOAP website ( http://soap.genomics.org.cn/ )
 
 One line for One hit. The columns are separated by '\t'.
diff --git a/scripts/Utils.py b/scripts/Utils.py
index c17fee9..5117d85 100644
@@ -6,6 +6,11 @@ from numpy.matlib import mat,zeros,ones,inf
 import palma.palma_utils as pu
 from palma.output_formating import print_results
 
+# some useful constants
+
+extended_alphabet = ['-','a','c','g','t','n','[',']']
+alphabet          = ['-','a','c','g','t','n']
+
 def calc_info(acc,don,exons,qualities):
    min_score = 100
    max_score = -100
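
The new module-level alphabets make the allowed characters of palma's alignment strings explicit. A small hypothetical helper (check_alignment_chars is not part of the diff) shows how they might be used for validation:

    extended_alphabet = ['-','a','c','g','t','n','[',']']

    def check_alignment_chars(s, alphabet=extended_alphabet):
       # True if every character of s is drawn from the given alphabet
       return all(c in alphabet for c in s)

    check_alignment_chars('ac-g[t]a')   # -> True
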
diff --git a/scripts/compile_dataset.py b/scripts/compile_dataset.py
index 88d386d..a125664 100644
@@ -48,20 +48,21 @@ class DatasetGenerator:
    def saveAs(self,dataset_file):
       assert not os.path.exists(dataset_file), 'The data_file already exists!'
 
-      all_keys = self.training_set.keys()
-      random.shuffle(all_keys)
-      training_keys = all_keys[0:10000]
-
       # saving new datasets
+
+      #all_keys = self.training_set.keys()
+      #random.shuffle(all_keys)
+      #training_keys = all_keys[0:10000]
       #cPickle.dump(self.training_set,open('%s.train.pickle'%dataset_file,'w+'),protocol=2)
       #cPickle.dump(training_keys,open('%s.train_keys.pickle'%dataset_file,'w+'),protocol=2)
+
       cPickle.dump(self.testing_set,open('%s.test.pickle'%dataset_file,'w+'),protocol=2)
 
       prediction_keys = [0]*len(self.testing_set.keys())
       for pos,key in enumerate(self.testing_set.keys()):
          prediction_keys[pos] = key
 
-      cPickle.dump(self.prediction_keys,open('%s.test_keys.pickle'%dataset_file,'w+'),protocol=2)
+      cPickle.dump(prediction_keys,open('%s.test_keys.pickle'%dataset_file,'w+'),protocol=2)
 
 
    def compile_training_set(self):
@@ -122,7 +123,7 @@ class DatasetGenerator:
       self.training_set = dataset
       
 
-   def parse_map_file(self,dataset,map_file)
+   def parse_map_file(self,dataset,map_file):
       strand_map = ['-','+']
       instance_counter = 1
 
@@ -138,6 +139,12 @@ class DatasetGenerator:
          strand   = slist[3]
          strand   = strand_map[strand == 'D']
 
+         if strand != '+':
+            continue
+
+         if not chromo in range(1,6):
+            continue
+
          genomicSeq_start = pos - 1500
          genomicSeq_stop  = pos + 1500
 
@@ -155,10 +162,19 @@ class DatasetGenerator:
          currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
          currentQualities = (prb,cal_prb,chastity)
 
-         dataset[id] = (currentSeqInfo,original_est,currentQualities)
+         # as one single read can have several vmatch matches we store all
+         # these matches under the unique id of the read
+         if dataset.has_key(id):
+            old_entry = dataset[id]
+            old_entry.append((currentSeqInfo,original_est,currentQualities))
+            dataset[id] = old_entry
+         else:
+            dataset[id] = [(currentSeqInfo,original_est,currentQualities)]
 
          instance_counter += 1
 
+      print 'Total examples processed: %d' % instance_counter
+
       return dataset
 
 
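The hunk above changes the dataset layout: because a single read can produce several vmatch matches, dataset[id] is now a list of match tuples instead of a single tuple. A minimal sketch of the accumulation pattern (hypothetical add_match helper, tuple contents as in the diff):

    def add_match(dataset, id, currentSeqInfo, original_est, currentQualities):
       entry = (currentSeqInfo, original_est, currentQualities)
       if dataset.has_key(id):        # Python 2 idiom, as in the original
          dataset[id].append(entry)
       else:
          dataset[id] = [entry]

Appending in place is equivalent to the original's fetch-append-reassign sequence, since old_entry and dataset[id] refer to the same list object.
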
diff --git a/scripts/createAlignmentFileFromPrediction.py b/scripts/createAlignmentFileFromPrediction.py
index add55ec..cc37313 100644
@@ -9,20 +9,27 @@ import os.path
 import math
 from qpalma.parsers import *
 
-def prediction_on(filename,filtered_reads,out_fname):
+from Evaluation import load_chunks
+
+def prediction_on(current_dir,filtered_reads,out_fname):
     
-   print 'parsing filtered reads..'
-   all_filtered_reads = parse_filtered_reads(filtered_reads)
-   print 'found %d filtered reads' % len(all_filtered_reads)
+   #print 'parsing filtered reads..'
+   #all_filtered_reads = parse_filtered_reads(filtered_reads)
+   #print 'found %d filtered reads' % len(all_filtered_reads)
+
+   import qparser
+   qparser.parse_reads(filtered_reads)
 
    out_fh = open(out_fname,'w+')
 
-   allPredictions = cPickle.load(open(filename))
+   allPredictions = load_chunks(current_dir)
    allPositions = {}
 
    for current_prediction in allPredictions:
       id = current_prediction['id']
-      current_ground_truth = all_filtered_reads[id]
+
+      #current_ground_truth = all_filtered_reads[id]
+      current_ground_truth = qparser.fetch_read(id)
 
       true_cut = current_ground_truth['true_cut']
       seq = current_ground_truth['seq']
@@ -35,26 +42,44 @@ def prediction_on(filename,filtered_reads,out_fname):
       alignment   = current_prediction['alignment']
 
       predExons = current_prediction['predExons']
+      predExons = [e+start_pos for e in predExons]
+
+      p_start  = current_ground_truth['p_start']
+      e_stop   = current_ground_truth['exon_stop']
+      e_start  = current_ground_truth['exon_start']
+      p_stop   = current_ground_truth['p_stop']
+      cut_pos = current_ground_truth['true_cut']
+
+      correct_prediction = False
+
+      #if len(predExons) == 4:
+      #   spliced_flag = True
+      #   predExons[1] -= 1
+      #   predExons[3] -= 1
 
-      if len(predExons) == 4:
-         p1 = predExons[0]
-         p2 = predExons[1]
-         p3 = predExons[2]
-         p4 = predExons[3]
+      #   if p_start == predExons[0] and e_stop == predExons[1] and\
+      #   e_start == predExons[2] and p_stop == predExons[3]:
+      #      correct_prediction = True
 
-      elif len(predExons) == 2:
-         p1 = predExons[0]
-         p2 = -1
-         p3 = -1
-         p4 = predExons[1]
+      #   if p_start == predExons[0] and e_stop == predExons[1] and\
+      #   e_start == predExons[2] and p_stop+1 == predExons[3]:
+      #      print 'special case'
 
-      #allPositions[exampleId] = (chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment)
+      #elif len(predExons) == 2:
+      #   spliced_flag = False
+      #   predExons[1] -= 1
+
+      #   if math.fabs(p_start - predExons[0]) <= 0 and math.fabs(p_stop - predExons[1]) <= 2:
+      #      correct_prediction = True
+      #      
+      #else:
+      #   pass
 
       (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
       tExonSizes,tStarts, tEnds) = alignment 
 
       new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
-      (id,chr,strand,seq,q1,start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
+      (id,chr,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
       str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
       str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
       str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
@@ -64,62 +89,61 @@ def prediction_on(filename,filtered_reads,out_fname):
    return allPositions
 
 
-def writePredictions(fname,allPositions):
-
-
-   allEntries = {}
-
-   for line in open('/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.pipeline'):
-      line = line.strip()
-      id,seq,q1,q2,q3 = line.split()
-      id = int(id)
-      
-      allEntries[id] = (seq,q1,q2,q3)
-
-
-   for id,elems in allPositions.items():
-      seq,q1,q2,q3 = allEntries[id]
-      chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment = elems
-
-      p1 += start_pos
-      p2 += start_pos
-      p3 += start_pos
-      p4 += start_pos
-
-
-
-   out_fh.close()
-
-def collect_prediction(current_dir):
-   """
-   Given the toplevel directory this function takes care that for each distinct
-   experiment the training and test predictions are evaluated.
-
-   """
-   train_suffix   = '_allPredictions_TRAIN'
-   test_suffix    = '_allPredictions_TEST'
-
-   run_name = 'run_+_quality_+_splicesignals_+_intron_len_1'
-
-   jp = os.path.join
-   b2s = ['-','+']
-
-   #currentRun = cPickle.load(open(jp(current_dir,'run_object_1.pickle')))
-   #QFlag    = currentRun['enable_quality_scores']
-   #SSFlag   = currentRun['enable_splice_signals']
-   #ILFlag   = currentRun['enable_intron_length']
-   #currentRunId = '%s%s%s' % (b2s[QFlag],b2s[SSFlag],b2s[ILFlag])
-   
-   filename =  jp(current_dir,run_name)+test_suffix
-   print 'Prediction on: %s' % filename
-   test_result = prediction_on(filename)
-
-   fname = 'predictions.txt'
-   writePredictions(fname,test_result)
-
+#def writePredictions(fname,allPositions):
+#
+#
+#   allEntries = {}
+#
+#   for line in open('/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.pipeline'):
+#      line = line.strip()
+#      id,seq,q1,q2,q3 = line.split()
+#      id = int(id)
+#      
+#      allEntries[id] = (seq,q1,q2,q3)
+#
+#
+#   for id,elems in allPositions.items():
+#      seq,q1,q2,q3 = allEntries[id]
+#      chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment = elems
+#
+#      p1 += start_pos
+#      p2 += start_pos
+#      p3 += start_pos
+#      p4 += start_pos
+#
+#   out_fh.close()
+#
+
+#def collect_prediction(current_dir):
+#   """
+#   Given the toplevel directory this function takes care that for each distinct
+#   experiment the training and test predictions are evaluated.
+#
+#   """
+#   train_suffix   = '_allPredictions_TRAIN'
+#   test_suffix    = '_allPredictions_TEST'
+#
+#   run_name = 'run_+_quality_+_splicesignals_+_intron_len_1'
+#
+#   jp = os.path.join
+#   b2s = ['-','+']
+#
+#   #currentRun = cPickle.load(open(jp(current_dir,'run_object_1.pickle')))
+#   #QFlag    = currentRun['enable_quality_scores']
+#   #SSFlag   = currentRun['enable_splice_signals']
+#   #ILFlag   = currentRun['enable_intron_length']
+#   #currentRunId = '%s%s%s' % (b2s[QFlag],b2s[SSFlag],b2s[ILFlag])
+#   
+#   filename =  jp(current_dir,run_name)+test_suffix
+#   print 'Prediction on: %s' % filename
+#   test_result = prediction_on(filename)
+#
+#   fname = 'predictions.txt'
+#   writePredictions(fname,test_result)
+#
 
 if __name__ == '__main__':
-   filename = sys.argv[1]
+   current_dir = sys.argv[1]
    filtered_reads = sys.argv[2]
    out_fh = sys.argv[3]
-   prediction_on(filename,filtered_reads,out_fh)
+   prediction_on(current_dir,filtered_reads,out_fh)
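
Two details of the hunk above are easy to miss: the predicted exon boundaries come back relative to the extracted genomic window and are shifted by start_pos into chromosome coordinates, and str(...)[1:-1].replace(' ','') flattens a Python list into a compact comma-separated field for the tab-delimited output line. A sketch with made-up values:

    start_pos = 104500                        # window start (illustrative)
    predExons = [12, 87, 410, 486]            # window-relative boundaries
    predExons = [e + start_pos for e in predExons]
    # -> [104512, 104587, 104910, 104986]     # chromosome coordinates

    qExonSizes = [36, 40]
    field = str(qExonSizes)[1:-1].replace(' ','')
    # -> '36,40'
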
diff --git a/scripts/createNewDataset.py b/scripts/createNewDataset.py
index a8a761b..fa64382 100644
@@ -4,16 +4,12 @@
 import qpalma.Configuration as Conf
 from compile_dataset import DatasetGenerator
 
-#filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/allReads.full.newid'
-#remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/map_2nd.vm_chr'
+filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/allReads.full'
 
-filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/allReads.test'
-remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/map.test'
+map_1_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/spliced.heuristic'
+map_2_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/map_2nd.vm'
 
-#compile_d2(Conf.dna_flat_fn,filtered_fn,remapped_fn,'dataset_2nd')
-#compile_dataset_direct(filtered_fn,'dataset_all_spliced_reads_2.pickle')
-
-dg = DatasetGenerator(filtered_fn,remapped_fn)
-dg.compile_training_set()
+dg = DatasetGenerator(filtered_fn,map_1_fn,map_2_fn)
+#dg.compile_training_set()
 dg.compile_testing_set()
-dg.saveAs('dataset_11_05')
+dg.saveAs('dataset_19_05_08')
diff --git a/scripts/grid_predict.py b/scripts/grid_predict.py
index 50657c9..54ed50a 100644
@@ -43,10 +43,10 @@ def makeJobs(run,dataset_fn,chunks,param):
 
    jobs=[]
 
-   for c_name,current_chunk in chunks[:1]:
+   for c_name,current_chunk in chunks:
       current_job = KybJob(grid_predict.g_predict,[run,dataset_fn,current_chunk,param,c_name])
       current_job.h_vmem = '5.0G'
-      current_job.express = 'True'
+      #current_job.express = 'True'
 
       print "job #1: ", current_job.nativeSpecification
 
@@ -74,7 +74,7 @@ def create_and_submit():
 
    print 'Found %d keys for prediction.' % len(prediction_keys)
 
-   num_splits = 12
+   num_splits = 15
    slices = get_slices(len(prediction_keys),num_splits)
    chunks = []
    for idx,slice in enumerate(slices):
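
get_slices itself is not part of this diff; judging from the call site, it splits the index range of the prediction keys into num_splits chunks, one per grid job. A purely hypothetical sketch of such a splitter (the real implementation in grid_predict.py may differ in shape and detail):

    def get_slices(total, num_splits):
       # divide range(total) into num_splits near-equal [start, stop) pairs
       base = total / num_splits            # Python 2 integer division
       slices = []
       for idx in range(num_splits):
          start = idx * base
          stop  = (idx + 1) * base
          if idx == num_splits - 1:
             stop = total                   # last chunk absorbs the remainder
          slices.append((start, stop))
       return slices
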
diff --git a/scripts/qpalma_main.py b/scripts/qpalma_main.py
index 8af76e2..4ce1a39 100644
@@ -713,7 +713,10 @@ class QPalma:
       # end of the prediction loop we save all predictions in a pickle file and exit
       cPickle.dump(allPredictions,open('%s.predictions.pickle'%(set_name),'w+'))
       print 'Prediction completed'
-      print 'Problem ctr %d' % problem_ctr
+      self.plog('Prediction completed\n')
+      mes =  'Problem ctr %d' % problem_ctr
+      print mes
+      self.plog(mes+'\n')
       self.logfh.close()
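
The prediction epilogue now mirrors every console message into the run log via self.plog before closing the log handle. plog's implementation is not shown in this diff; a hypothetical minimal version consistent with the usage here:

    def plog(self, mes):
       # hypothetical sketch: append to the run's log file and flush right away
       self.logfh.write(mes)
       self.logfh.flush()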