+ added a faster evaluation function
author    fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
          Sat, 19 Apr 2008 15:35:52 +0000 (15:35 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
          Sat, 19 Apr 2008 15:35:52 +0000 (15:35 +0000)
+ fixed a bug in PipelineHeuristic when returning alternative alignment scores
+ corrected malformed if/elif statements in Lookup

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8700 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/Evaluation.py
scripts/Experiment.py
scripts/Lookup.py
scripts/ModelSelection.py
scripts/PipelineHeuristic.py
scripts/compile_dataset.py
scripts/createAlignmentFileFromPrediction.py
scripts/createNewDataset.py

diff --git a/scripts/Evaluation.py b/scripts/Evaluation.py
index af9bccb..d0d9e7d 100644
@@ -357,6 +357,8 @@ def predict_on(filename,filtered_reads):
 
    total_vmatch_instances_ctr = 0
 
+   cut_pos_ctr = {}
+
 
    for current_prediction in allPredictions:
       id = current_prediction['id']
@@ -390,6 +392,11 @@ def predict_on(filename,filtered_reads):
          else:
             pos_incorrect_ctr += 1
             #pdb.set_trace()
+            try:
+               cut_pos_ctr[cut_pos] += 1
+            except KeyError:   # first occurrence of this cut position
+               cut_pos_ctr[cut_pos] = 1
+
 
       elif len(predExons) == 2:
          predExons[1] -= 1
@@ -422,6 +429,7 @@ def predict_on(filename,filtered_reads):
    print 'Total num. of examples: %d' % numPredictions
    print 'Correct pos: %2.3f, incorrect pos: %2.3f' %\
    (pos_correct_ctr/(1.0*numPredictions),pos_incorrect_ctr/(1.0*numPredictions))
+   print cut_pos_ctr
 
    #print 'Correct scores: %d, incorrect scores: %d' %\
    #(score_correct_ctr,score_incorrect_ctr)
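
Note on the new cut_pos_ctr bookkeeping: it is the usual dict-as-histogram
pattern. A minimal standalone sketch (the cut positions are invented for
illustration), using the narrower KeyError catch:

   # count how often each cut position occurs; positions are made up
   cut_pos_ctr = {}
   for cut_pos in [36, 12, 36, 7, 36]:
      try:
         cut_pos_ctr[cut_pos] += 1
      except KeyError:              # first occurrence of this position
         cut_pos_ctr[cut_pos] = 1

   print cut_pos_ctr                # e.g. {36: 3, 12: 1, 7: 1} (dict order may vary)
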
diff --git a/scripts/Experiment.py b/scripts/Experiment.py
index aa95e2a..4d2d4ad 100644
@@ -16,12 +16,51 @@ import pdb
 import os
 import os.path
 
+def get_dataset_splitting_instances(dataset_size,num_nodes):
+   all_instances = []
+
+   part = dataset_size / num_nodes
+   begin = 0
+   end = 0
+   for idx in range(1,num_nodes+1):
+      
+      if idx == num_nodes:
+         begin = end
+         end   = dataset_size
+      else:
+         begin = end
+         end = begin+part
+
+      params = [\
+      ('prediction_begin',begin),\
+      ('prediction_end',end)]
+
+      all_instances.append(params)
+
+   return all_instances
+
+
+def get_scoring_instances():
+   all_instances = []
+
+   for QFlag in [True,False]:
+      for SSFlag in [True,False]:
+         for ILFlag in [True,False]:
+            params = [\
+            ('enable_quality_scores',QFlag),\
+            ('enable_splice_signals',SSFlag),\
+            ('enable_intron_length',ILFlag)]
+            all_instances.append(params)
+
+   return all_instances
+
+
 def createRuns():
    # specify n for n-fold cross validation
    numFolds=5
 
    # the main directory where all results are stored
-   experiment_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalmaTraining'
+   experiment_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_single_run'
 
    assert os.path.exists(experiment_dir), 'toplevel dir for experiment does not exist!'
 
@@ -34,89 +73,103 @@ def createRuns():
 
    allRuns = []
 
-   dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_test_new'
+   dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_2nd'
 
-   for QFlag in [True,False]:
-      for SSFlag in [True,False]:
-         for ILFlag in [True,False]:
+   dataset_size = 400000
+   num_nodes = 10
+
+   ctr = 1
+
+   #all_instances = get_scoring_instances()
+   all_instances = get_dataset_splitting_instances(dataset_size,num_nodes)
             
-            # create a new Run object
-            currentRun = Run()
+   for parameters in all_instances:
+      # create a new Run object
+      currentRun = Run()
+      currentName = 'run_'
+
+      for param_name,param in parameters:
+         currentRun[param_name] = param
+         currentName += '%s_%s_' % (str(param_name),str(param))
+         print param_name,param
+         print currentName
             
-            # global settings for all runs
-            currentRun['anzpath']            = Conf.anzpath
-            currentRun['iter_steps']         = Conf.iter_steps
-            currentRun['matchmatrixRows']    = Conf.sizeMatchmatrix[0]
-            currentRun['matchmatrixCols']    = Conf.sizeMatchmatrix[1]
-            currentRun['mode']               = Conf.mode
-            currentRun['numConstraintsPerRound'] = Conf.numConstraintsPerRound 
+      # global settings for all runs
+      currentRun['anzpath']            = Conf.anzpath
+      currentRun['iter_steps']         = Conf.iter_steps
+      currentRun['matchmatrixRows']    = Conf.sizeMatchmatrix[0]
+      currentRun['matchmatrixCols']    = Conf.sizeMatchmatrix[1]
+      currentRun['mode']               = Conf.mode
+      currentRun['numConstraintsPerRound'] = Conf.numConstraintsPerRound 
 
-            currentRun['remove_duplicate_scores']  = Conf.remove_duplicate_scores
-            currentRun['print_matrix']          = Conf.print_matrix
-            currentRun['read_size']             = Conf.read_size
+      currentRun['remove_duplicate_scores']  = Conf.remove_duplicate_scores
+      currentRun['print_matrix']          = Conf.print_matrix
+      currentRun['read_size']             = Conf.read_size
 
-         
-            currentRun['numLengthSuppPoints']   = 10 #Conf.numLengthSuppPoints
+      currentRun['numDonSuppPoints']      = 10
+      currentRun['numAccSuppPoints']      = 10
 
-            # if we are not using an intron length model at all we do not need the support points
-            if ILFlag == False:
-               currentRun['numLengthSuppPoints']   = 2 #Conf.numLengthSuppPoints
+      currentRun['numQualPlifs']          = Conf.numQualPlifs
+      currentRun['numQualSuppPoints']     = 10
+      currentRun['totalQualSuppPoints']   = currentRun['numQualPlifs']*currentRun['numQualSuppPoints']
 
-            currentRun['numDonSuppPoints']      = 10
-            currentRun['numAccSuppPoints']      = 10
+      currentRun['enable_quality_scores'] = True
+      currentRun['enable_splice_signals'] = True
+      currentRun['enable_intron_length']  = True
 
-            currentRun['numQualPlifs']          = Conf.numQualPlifs
-            currentRun['numQualSuppPoints']     = 10
-            currentRun['totalQualSuppPoints']   = currentRun['numQualPlifs']*currentRun['numQualSuppPoints']
+      # if we are not using an intron length model at all we do not need the support points
+      currentRun['numLengthSuppPoints']   = 10 #Conf.numLengthSuppPoints
 
-            currentRun['numFeatures']           = currentRun['numLengthSuppPoints']\
-            + currentRun['numDonSuppPoints'] + currentRun['numAccSuppPoints']\
-            + currentRun['matchmatrixRows'] * currentRun['matchmatrixCols']\
-            + currentRun['totalQualSuppPoints'] 
+      if currentRun['enable_intron_length'] == False:
+         currentRun['numLengthSuppPoints']   = 2 #Conf.numLengthSuppPoints
 
-            # run-specific settings
-            currentRun['training_begin']         = Conf.training_begin
-            currentRun['training_end']           = Conf.training_end
-            currentRun['prediction_begin']       = Conf.prediction_begin
-            currentRun['prediction_end']         = Conf.prediction_end
+      currentRun['numFeatures']           = currentRun['numLengthSuppPoints']\
+      + currentRun['numDonSuppPoints'] + currentRun['numAccSuppPoints']\
+      + currentRun['matchmatrixRows'] * currentRun['matchmatrixCols']\
+      + currentRun['totalQualSuppPoints'] 
 
-            currentRun['enable_quality_scores'] = QFlag
-            currentRun['enable_splice_signals'] = SSFlag
-            currentRun['enable_intron_length']  = ILFlag
+      # run-specific settings
+      currentRun['training_begin']        = Conf.training_begin
+      currentRun['training_end']          = Conf.training_end
+      #currentRun['prediction_begin']      = Conf.prediction_begin
+      #currentRun['prediction_end']        = Conf.prediction_end
 
-            currentName = 'run_%s_quality_%s_splicesignals_%s_intron_len' %\
-            (bool2str[QFlag],bool2str[SSFlag],bool2str[ILFlag])
+      currentRun['C']                     = 100
 
-            currentRun['C']                     = 100
+      currentRun['name']                  = currentName
+      currentRun['dataset_filename']      = dataset_filename
+      currentRun['experiment_path']       = experiment_dir
 
-            currentRun['name']                  = currentName
-            currentRun['dataset_filename']      = dataset_filename
-            currentRun['experiment_path']       = experiment_dir
+      currentRun['min_intron_len'] = 20
+      currentRun['max_intron_len'] = 2000
 
-            currentRun['min_intron_len'] = 20
-            currentRun['max_intron_len'] = 2000
+      currentRun['min_svm_score'] = 0.0 
+      currentRun['max_svm_score'] = 1.0
 
-            #currentRun['min_intron_len'] = 10
-            #currentRun['max_intron_len'] = 100
+      currentRun['min_qual'] = -5
+      currentRun['max_qual'] = 40
 
-            currentRun['min_svm_score'] = 0.0 
-            currentRun['max_svm_score'] = 1.0
+      currentRun['dna_flat_files']      = Conf.dna_flat_fn
 
-            currentRun['min_qual'] = -5
-            currentRun['max_qual'] = 40
+      currentRun['id']      = ctr
+      ctr += 1
 
-            currentRun['dna_flat_files']      = Conf.dna_flat_fn
+      allRuns.append(currentRun)
+
+
+###############################################################################
+#
+# check for valid paths / options etc
+#
+###############################################################################
 
-            allRuns.append(currentRun)
 
-   #
-   # check for valid paths / options etc
-   #
    for currentRun in allRuns:
 
       assert 0 < currentRun['anzpath'] < 100
       assert 0 <= currentRun['training_begin'] < currentRun['training_end']
-      assert currentRun['training_end'] <= currentRun['prediction_begin'] < currentRun['prediction_end']
+      assert currentRun['training_begin'] < currentRun['training_end'] 
+      assert currentRun['prediction_begin'] < currentRun['prediction_end']
 
       assert currentRun['iter_steps']
 
@@ -156,4 +209,4 @@ def createRuns():
 
 if __name__ == '__main__':
    allRuns = createRuns()
-   pdb.set_trace()
+   #pdb.set_trace()
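
Note on get_dataset_splitting_instances: with the values used above
(dataset_size=400000, num_nodes=10) it yields ten contiguous [begin,end)
ranges; Python 2 integer division gives 40000 examples per node and the
last node absorbs any remainder. A self-contained sketch of the computed
split:

   # sketch of the split computed by get_dataset_splitting_instances(400000,10)
   dataset_size, num_nodes = 400000, 10
   part = dataset_size / num_nodes     # integer division under Python 2
   end = 0
   for idx in range(1, num_nodes+1):
      begin = end
      if idx == num_nodes:
         end = dataset_size            # last node takes the remainder
      else:
         end = begin + part
      print (begin, end)
   # prints (0, 40000), (40000, 80000), ..., (360000, 400000)
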
diff --git a/scripts/Lookup.py b/scripts/Lookup.py
index a0673ab..52986b7 100644
@@ -14,7 +14,7 @@ from Genefinding import *
 from genome_utils import load_genomic
 
 # only for checking purposes
-from compile_dataset import  get_seq_and_scores
+#from compile_dataset import  get_seq_and_scores
 
 
 class Lookup:
@@ -30,9 +30,7 @@ class Lookup:
       self.max_size = 40000000
 
       self.chromosomes = range(1,6)
-      #self.chromosomes = [1]
-
-      strands = ['+','-']
+      self.strands = ['+']
 
       self.genomicSequencesP   = [0]*(len(self.chromosomes)+1)
       self.acceptorScoresP     = [0]*(len(self.chromosomes)+1)
@@ -43,31 +41,34 @@ class Lookup:
       self.donorScoresN        = [0]*(len(self.chromosomes)+1)
      
       for chrId in self.chromosomes:
-         for strandId in strands:
+         for strandId in self.strands:
             print (chrId,strandId)
             self.prefetch_seq_and_scores(chrId,strandId)
 
  
    def get_seq_and_scores(self,chr,strand,start,stop,dna_flat_files):
       assert chr in self.chromosomes
+      assert strand in self.strands
 
-      if strand == '+'
+      if strand == '+':
          currentSequence         = self.genomicSequencesP[chr][start-1:stop]
          length = len(currentSequence)
-         currentDonorScores      = self.donorScoresP[chr][start:stop]
-         currentDonorScores      += [-inf]*(length-len(currentDonorScores))
-         currentAcceptorScores   = self.acceptorScoresP[chr][start:stop+2]
-         currentAcceptorScores   = currentAcceptorScoresP[1:]
-      elif:
+         currentDonorScores     = self.donorScoresP[chr][start:stop]
+         currentDonorScores     += [-inf]*(length-len(currentDonorScores))
+         currentAcceptorScores  = self.acceptorScoresP[chr][start:stop+2]
+         currentAcceptorScores  = currentAcceptorScores[1:]
+      elif strand == '-':
          currentSequence         = self.genomicSequencesN[chr][start-1:stop]
          length = len(currentSequence)
-         currentDonorScores      = self.donorScoresN[chr][start:stop]
-         currentDonorScores      += [-inf]*(length-len(currentDonorScores))
-         currentAcceptorScores   = self.acceptorScoresN[chr][start:stop+2]
-         currentAcceptorScores   = currentAcceptorScoresN[1:]
+         currentDonorScores     = self.donorScoresN[chr][start:stop]
+         currentDonorScores     += [-inf]*(length-len(currentDonorScores))
+         currentAcceptorScores  = self.acceptorScoresN[chr][start:stop+2]
+         currentAcceptorScores  = currentAcceptorScores[1:]
+      else:
+         pdb.set_trace()
 
 
-      return currentSequence, currentAcceptorScores, currentDonorScores
+      return currentSequence.lower(), currentAcceptorScores, currentDonorScores
 
 
    def prefetch_seq_and_scores(self,chr,strand):
@@ -93,12 +94,12 @@ class Lookup:
 
       currentAcc, currentDon = self.getSpliceScores(chr,strand,intervalBegin,intervalEnd)
 
-      if strand == '+'
+      if strand == '+':
          self.genomicSequencesP[chr] = genomicSeq
          self.acceptorScoresP[chr]   = currentAcc
          self.donorScoresP[chr]      = currentDon
 
-      elif:
+      elif strand == '-':
          self.genomicSequencesN[chr] = genomicSeq
          self.acceptorScoresN[chr]   = currentAcc
          self.donorScoresN[chr]      = currentDon
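
Note on the corrected get_seq_and_scores: each strand branch slices the
prefetched per-chromosome arrays and pads the donor scores with -inf up
to the sequence length. A toy sketch of that slice-and-pad step, with an
invented sequence and scores:

   # toy sketch of the slice-and-pad logic; -inf marks positions
   # without a predicted donor site
   inf = float('inf')

   genomicSeq  = 'ACGTACGTAC'
   donorScores = [0.1, 0.7, 0.3]       # shorter than the sequence

   start, stop = 1, 10
   currentSequence    = genomicSeq[start-1:stop]
   length             = len(currentSequence)
   currentDonorScores = donorScores[start:stop]
   currentDonorScores += [-inf]*(length-len(currentDonorScores))

   print currentSequence.lower()       # 'acgtacgtac'
   print currentDonorScores            # [0.7, 0.3, -inf, ..., -inf]
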
diff --git a/scripts/ModelSelection.py b/scripts/ModelSelection.py
index af196e3..28b59f6 100644
@@ -5,6 +5,7 @@ import cPickle
 import os
 import os.path
 import time
+import subprocess
 
 from qpalma_main import QPalma
 import Experiment as Exp
@@ -20,7 +21,7 @@ class Model:
 
       allRuns = Exp.createRuns()
       
-      for currentRun in allRuns:
+      for currentRun in allRuns[0:1]:
                
          currentInstance = QPalma(currentRun)
 
@@ -33,6 +34,7 @@ class Model:
 
          self.allInstances.append(instance_fn)
 
+
    def doSelection(self):
       for instance in self.allInstances:
          time.sleep(2)
@@ -43,7 +45,23 @@ class Model:
          #os.system('echo "./resurrect %s >out_%s.log 2>err_%s.log &"'%(instance,instance,instance))
 
 
+   def doPrediction(self):
+      current_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_single_run'
+
+      for instance in self.allInstances:
+         time.sleep(2)
+         cmd = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s 1>_%s.log 2>%s.log & '%(instance,instance,instance)
+         #cmd = 'echo /fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s |\
+         #qsub -l h_vmem=12.0G -cwd -j y -N \"%s.log\"'%(instance,instance)
+         #os.system(cmd) 
+         obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+         #out,err = obj.communicate()
+         #print out,err
+
+
+
 if __name__ == '__main__':
    m = Model()
    m.createInstances()
-   m.doSelection()
+   #m.doSelection()
+   m.doPrediction()
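
Note on the new doPrediction: it launches each prediction as a detached
shell job via subprocess instead of os.system. A minimal sketch of that
launch pattern (script path and instance name are placeholders, not the
real paths):

   import subprocess

   # sketch: launch a prediction job in the background; the trailing '&'
   # together with shell=True detaches it from the Python process
   instance = 'instance_1.pickle'      # placeholder name
   cmd = './doPrediction.sh %s 1>%s.out.log 2>%s.err.log &' % (instance,instance,instance)
   obj = subprocess.Popen(cmd, shell=True,
                          stdout=subprocess.PIPE, stderr=subprocess.PIPE)
   # to run synchronously instead, drop the '&' and call:
   #out,err = obj.communicate()
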
diff --git a/scripts/PipelineHeuristic.py b/scripts/PipelineHeuristic.py
index 8e8b6d5..175e18d 100644
@@ -67,7 +67,7 @@ class PipelineHeuristic:
       dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
 
       start = cpu()
-      self.all_remapped_reads = parse(data_fname)
+      self.all_remapped_reads = parse_map_vm_heuristic(data_fname)
       stop = cpu()
 
       print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
@@ -247,7 +247,11 @@ class PipelineHeuristic:
          currentVMatchAlignment = dna, exons, est, original_est, quality,\
          currentAcc, currentDon
 
-         alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+         try:
+            alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+         except Exception:
+            alternativeAlignmentScores = []
+
 
          if alternativeAlignmentScores == []:
              # no alignment necessary
@@ -384,7 +388,7 @@ class PipelineHeuristic:
       stop = cpu()
       self.get_time += stop-start
       dna   = currentDNASeq
-
+   
       proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc, currentDon, dna, self.max_intron_size, len(est), self.splice_thresh)
        
       alternativeScores = []
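
Note on the guarded calcAlternativeAlignments call: a crash while scoring
one read now degrades to "no alternative alignments" instead of aborting
the whole heuristic run. The same fail-soft guard in isolation (the
raising function is a stand-in for the real scoring routine):

   # stand-in for the real scoring routine, simulating a failure
   def calcAlternativeAlignments(location):
      raise ValueError('simulated failure for one read')

   try:
      alternativeAlignmentScores = calcAlternativeAlignments('chr1:1234')
   except Exception:
      alternativeAlignmentScores = []

   if alternativeAlignmentScores == []:
      print 'no alignment necessary'
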
diff --git a/scripts/compile_dataset.py b/scripts/compile_dataset.py
index 0be5ec0..46e5434 100644
@@ -109,14 +109,8 @@ def compile_d2(gff_file,dna_flat_files,filtered_reads,remapped_reads,dataset_fil
             if instance_counter % 1000 == 0:
                print 'processed %d examples' % instance_counter
 
-            #if instance_counter == 400000:
-            #   break
-
             if reRead['strand'] != '+':
                continue
-   
-            #seq_info, exons, cut_pos =\
-            #   process_filtered_read(currentFRead,dna_flat_files)
          
             (id,chr,strand,genomicSeq_start,genomicSeq_stop) =\
             process_map(reRead,currentFRead,dna_flat_files)
diff --git a/scripts/createAlignmentFileFromPrediction.py b/scripts/createAlignmentFileFromPrediction.py
index 78f7ebe..add55ec 100644
@@ -3,54 +3,69 @@
 
 import cPickle
 import sys
-import pydb
 import pdb
 import os
 import os.path
 import math
+from qpalma.parsers import *
 
-def prediction_on(filename):
+def prediction_on(filename,filtered_reads,out_fname):
     
-   allPredictions = cPickle.load(open(filename))
+   print 'parsing filtered reads..'
+   all_filtered_reads = parse_filtered_reads(filtered_reads)
+   print 'found %d filtered reads' % len(all_filtered_reads)
+
+   out_fh = open(out_fname,'w+')
 
+   allPredictions = cPickle.load(open(filename))
    allPositions = {}
 
+   for current_prediction in allPredictions:
+      id = current_prediction['id']
+      current_ground_truth = all_filtered_reads[id]
 
-   for current_example_pred in allPredictions:
-      gt_example = current_example_pred[0]
+      true_cut = current_ground_truth['true_cut']
+      seq = current_ground_truth['seq']
+      q1 = current_ground_truth['prb']
 
-      exampleId = gt_example['id']
+      chr         = current_prediction['chr']
+      strand      = current_prediction['strand']
+      #true_cut    = current_prediction['true_cut']
+      start_pos   = current_prediction['start_pos']
+      alignment   = current_prediction['alignment']
 
-      for elem_nr,current_pred in enumerate(current_example_pred[1:]):
-         exampleId   = current_pred['id']
-         chr         = current_pred['chr']
-         strand      = current_pred['strand']
-         true_cut    = current_pred['true_cut']
-         start_pos   = current_pred['alternative_start_pos']
-         alignment   = current_pred['alignment']
+      predExons = current_prediction['predExons']
 
-         predExons = current_pred['predExons']
-         trueExons = current_pred['trueExons']
-         predPositions = [elem + current_pred['alternative_start_pos'] for elem in predExons]
-         truePositions = [elem + current_pred['start_pos'] for elem in trueExons.flatten().tolist()[0]]
+      if len(predExons) == 4:
+         p1 = predExons[0]
+         p2 = predExons[1]
+         p3 = predExons[2]
+         p4 = predExons[3]
 
-         if len(predExons) == 4:
-            p1 = predExons[0]
-            p2 = predExons[1]
-            p3 = predExons[2]
-            p4 = predExons[3]
+      elif len(predExons) == 2:
+         p1 = predExons[0]
+         p2 = -1
+         p3 = -1
+         p4 = predExons[1]
 
-            #line = '%d\t%d\t%d\t%d\t%d\n' % (exampleId,p1,p2,p3,p4)
-            #print line
-            allPositions[exampleId] = (chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment)
+      #allPositions[exampleId] = (chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment)
 
+      (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
+      tExonSizes,tStarts, tEnds) = alignment 
+
+      new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
+      (id,chr,strand,seq,q1,start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
+      str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
+      str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
+      str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
+
+      out_fh.write(new_line)
 
    return allPositions
 
 
 def writePredictions(fname,allPositions):
 
-   out_fh = open(fname,'w+')
 
    allEntries = {}
 
@@ -63,7 +78,6 @@ def writePredictions(fname,allPositions):
 
 
    for id,elems in allPositions.items():
-      id += 1000000000000
       seq,q1,q2,q3 = allEntries[id]
       chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment = elems
 
@@ -72,18 +86,7 @@ def writePredictions(fname,allPositions):
       p3 += start_pos
       p4 += start_pos
 
-      #pdb.set_trace()
 
-      (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
-      tExonSizes,tStarts, tEnds) = alignment 
-
-      new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
-      (id,chr,strand,seq,q1,start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
-      str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
-      str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
-      str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
-
-      out_fh.write(new_line)
 
    out_fh.close()
 
@@ -101,16 +104,12 @@ def collect_prediction(current_dir):
    jp = os.path.join
    b2s = ['-','+']
 
-   currentRun = cPickle.load(open(jp(current_dir,'run_object_1.pickle')))
-   QFlag    = currentRun['enable_quality_scores']
-   SSFlag   = currentRun['enable_splice_signals']
-   ILFlag   = currentRun['enable_intron_length']
-   currentRunId = '%s%s%s' % (b2s[QFlag],b2s[SSFlag],b2s[ILFlag])
+   #currentRun = cPickle.load(open(jp(current_dir,'run_object_1.pickle')))
+   #QFlag    = currentRun['enable_quality_scores']
+   #SSFlag   = currentRun['enable_splice_signals']
+   #ILFlag   = currentRun['enable_intron_length']
+   #currentRunId = '%s%s%s' % (b2s[QFlag],b2s[SSFlag],b2s[ILFlag])
    
-   #filename =  jp(current_dir,run_name)+train_suffix
-   #print 'Prediction on: %s' % filename
-   #prediction_on(filename)
-
    filename =  jp(current_dir,run_name)+test_suffix
    print 'Prediction on: %s' % filename
    test_result = prediction_on(filename)
@@ -118,10 +117,9 @@ def collect_prediction(current_dir):
    fname = 'predictions.txt'
    writePredictions(fname,test_result)
 
-   #return train_result,test_result,currentRunId
-
 
 if __name__ == '__main__':
-   dir = sys.argv[1]
-   #run_name = sys.argv[2]
-   collect_prediction(dir)
+   filename = sys.argv[1]
+   filtered_reads = sys.argv[2]
+   out_fname = sys.argv[3]
+   prediction_on(filename,filtered_reads,out_fname)
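
Note on the alignment output line: the integer lists are serialized into
compact comma-separated fields via str(list)[1:-1].replace(' ',''),
since str([23, 12]) is '[23, 12]'. A one-liner sketch:

   # sketch of the list-to-field serialization used above
   qExonSizes = [23, 12]
   print str(qExonSizes)[1:-1].replace(' ','')   # '23,12'
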
diff --git a/scripts/createNewDataset.py b/scripts/createNewDataset.py
index 5ae8f08..87aa533 100644
@@ -2,12 +2,13 @@
 # -*- coding: utf-8 -*-
 
 import qpalma.Configuration as Conf
-from compile_dataset import compile_d
+from compile_dataset import compile_d2
 
+#filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.full'
+#remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/map.vm_cut'
 
-#filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/allFilteredReads_20_03_2008'
+filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/allReads.full.newid'
+#remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/spliced.result_no_comm'
+remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/map_2nd.vm_chr'
 
-filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.full.perm'
-remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/map.vm_cut'
-
-compile_d(Conf.gff_fn,Conf.dna_flat_fn,filtered_fn,remapped_fn,Conf.tmp_dir,'dataset_remapped_test_new',True)
+compile_d2(Conf.gff_fn,Conf.dna_flat_fn,filtered_fn,remapped_fn,'dataset_2nd',True)