+ changed output format of alignment file script to be compatible with est2gff
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 11 Jun 2008 15:36:05 +0000 (15:36 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 11 Jun 2008 15:36:05 +0000 (15:36 +0000)
+ modified PipelineHeuristic to support faster parsers

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9571 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/PipelineHeuristic.py
scripts/Utils.py
scripts/createAlignmentFileFromPrediction.py
scripts/grid_heuristic.py
scripts/grid_predict.py
scripts/qpalma_main.py
scripts/run_heuristic

index 40fdadb..5d036cd 100644 (file)
@@ -18,28 +18,55 @@ from qpalma.compute_donacc import *
 from qpalma.TrainingParam import Param
 from qpalma.Plif import Plf
 
 from qpalma.TrainingParam import Param
 from qpalma.Plif import Plf
 
-#from qpalma.tools.splicesites import getDonAccScores
 from qpalma.Configuration import *
 
 from qpalma.Configuration import *
 
-#from compile_dataset import getSpliceScores, get_seq_and_scores
-
 from numpy.matlib import mat,zeros,ones,inf
 from numpy import inf,mean
 
 from qpalma.parsers import *
 
 from numpy.matlib import mat,zeros,ones,inf
 from numpy import inf,mean
 
 from qpalma.parsers import *
 
+from ParaParser import *
+
 from Lookup import *
 
 from qpalma.sequence_utils import *
 
 
 from Lookup import *
 
 from qpalma.sequence_utils import *
 
 
+
+class BracketWrapper:
+   fields = ['id', 'chr', 'pos', 'strand', 'mismatches', 'length',\
+   'offset', 'seq', 'prb', 'cal_prb', 'chastity']
+
+   def __init__(self,filename):
+      self.parser = ParaParser("%lu%d%d%s%d%d%d%s%s%s%s",self.fields,len(self.fields),IN_VECTOR)
+      self.parser.parseFile(filename)
+
+   def __len__(self):
+      return self.parser.getSize(IN_VECTOR)
+      
+   def __getitem__(self,key):
+      return self.parser.fetchEntry(key)
+
+   def __iter__(self):
+      self.counter = 0
+      self.size = self.parser.getSize(IN_VECTOR)
+      return self
+
+   def next(self):
+      if not self.counter < self.size:
+         raise StopIteration
+      return_val = self.parser.fetchEntry(self.counter)
+      self.counter += 1
+      return return_val
+
+
 class PipelineHeuristic:
    """
    This class wraps the filter which decides whether an alignment found by
    vmatch is spliced an should be then newly aligned using QPalma or not.
    """
 
 class PipelineHeuristic:
    """
    This class wraps the filter which decides whether an alignment found by
    vmatch is spliced an should be then newly aligned using QPalma or not.
    """
 
-   def __init__(self,run_fname,data_fname,reads_pipeline_fn,param_fname,result_spliced_fname,result_unspliced_fname):
+   def __init__(self,run_fname,data_fname,param_fname,result_fname):
       """
       We need a run object holding information about the nr. of support points
       etc.
       """
       We need a run object holding information about the nr. of support points
       etc.
@@ -50,8 +77,13 @@ class PipelineHeuristic:
 
       dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
 
 
       dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
 
+
       start = cpu()
       start = cpu()
-      self.all_remapped_reads = parse_map_vm_heuristic(data_fname)
+
+      # old version
+      #self.all_remapped_reads = parse_map_vm_heuristic(data_fname)
+      self.all_remapped_reads = BracketWrapper(data_fname)
+      
       stop = cpu()
 
       print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
       stop = cpu()
 
       print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
@@ -62,8 +94,8 @@ class PipelineHeuristic:
       stop = cpu()
       print 'prefetched sequence and splice data in %f sec' % (stop-start)
 
       stop = cpu()
       print 'prefetched sequence and splice data in %f sec' % (stop-start)
 
-      self.result_spliced_fh = open(result_spliced_fname,'w+')
-      self.result_unspliced_fh = open(result_unspliced_fname,'w+')
+      self.result_spliced_fh = open('%s.spliced'%result_fname,'w+')
+      self.result_unspliced_fh = open('%s.unspliced'%result_fname,'w+')
 
       start = cpu()
 
 
       start = cpu()
 
@@ -80,7 +112,7 @@ class PipelineHeuristic:
       self.mmatrix = mmatrix
       self.qualityPlifs = qualityPlifs
 
       self.mmatrix = mmatrix
       self.qualityPlifs = qualityPlifs
 
-      self.read_size    = 36
+      #self.read_size    = 36
 
       # parameters of the heuristics to decide whether the read is spliced
       #self.splice_thresh      = 0.005
 
       # parameters of the heuristics to decide whether the read is spliced
       #self.splice_thresh      = 0.005
@@ -108,13 +140,14 @@ class PipelineHeuristic:
          self.result_spliced_fh.write(p_line)
          self.result_unspliced_fh.write(p_line)
    
          self.result_spliced_fh.write(p_line)
          self.result_unspliced_fh.write(p_line)
    
-      self.original_reads = {}
+      #self.original_reads = {}
 
 
-      for line in open(reads_pipeline_fn):
-         line = line.strip()
-         id,seq,q1,q2,q3 = line.split()
-         id = int(id)
-         self.original_reads[id] = seq
+      # we do not have this information
+      #for line in open(reads_pipeline_fn):
+      #   line = line.strip()
+      #   id,seq,q1,q2,q3 = line.split()
+      #   id = int(id)
+      #   self.original_reads[id] = seq
 
       lengthSP    = run['numLengthSuppPoints']
       donSP       = run['numDonSuppPoints']
 
       lengthSP    = run['numLengthSuppPoints']
       donSP       = run['numDonSuppPoints']
@@ -203,7 +236,10 @@ class PipelineHeuristic:
 
          strand = strand_map[strand]
 
 
          strand = strand_map[strand]
 
-         if strand == '-':
+         #if strand == '-':
+         #   continue
+
+         if not chr in range(1,6):
             continue
 
          unb_seq = unbracket_est(seq)
             continue
 
          unb_seq = unbracket_est(seq)
@@ -578,28 +614,20 @@ def cpu():
 
 
 if __name__ == '__main__':
 
 
 if __name__ == '__main__':
-   if len(sys.argv) != 7:
-      print 'Usage: ./%s data reads param run spliced.results unspliced.results' % (sys.argv[0])
+   if len(sys.argv) != 6:
+      print 'Usage: ./%s data param run spliced.results unspliced.results' % (sys.argv[0])
+      sys.exit(1)
 
    data_fname = sys.argv[1]
 
    data_fname = sys.argv[1]
-   reads_pipeline_fn = sys.argv[2]
-   param_fname = sys.argv[3]
-   run_fname = sys.argv[4]
+   param_fname = sys.argv[2]
+   run_fname = sys.argv[3]
 
 
-   result_spliced_fname    = sys.argv[5]
-   result_unspliced_fname  = sys.argv[6]
+   result_spliced_fname    = sys.argv[4]
+   result_unspliced_fname  = sys.argv[5]
 
    jp = os.path.join
 
 
    jp = os.path.join
 
-   #dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_test/run_+_quality_+_splicesignals_+_intron_len'
-   #param_fname = jp(dir,'param_500.pickle')
-   #data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_20k'
-   #data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_2k'
-   #data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_100'
-   #result_spliced_fname    = 'splicedReads.heuristic'
-   #result_unspliced_fname  = 'unsplicedReads.heuristic'
-
-   ph1 = PipelineHeuristic(run_fname,data_fname,reads_pipeline_fn,param_fname,result_spliced_fname,result_unspliced_fname)
+   ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_spliced_fname,result_unspliced_fname)
 
    start = cpu()
    ph1.filter()
 
    start = cpu()
    ph1.filter()
index df46f4b..74d83a4 100644 (file)
@@ -1,6 +1,8 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+import pdb
+
 from numpy.matlib import mat,zeros,ones,inf
 
 import palma.palma_utils as pu
 from numpy.matlib import mat,zeros,ones,inf
 
 import palma.palma_utils as pu
@@ -144,7 +146,6 @@ def get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
 
 ##########
 
 
 ##########
 
-
 def split_file_join_results(filename,parts):
    all_intervals = []
 
 def split_file_join_results(filename,parts):
    all_intervals = []
 
@@ -153,6 +154,8 @@ def split_file_join_results(filename,parts):
    for line in open(filename,'r'):
       line_ctr += 1
 
    for line in open(filename,'r'):
       line_ctr += 1
 
+   print 'found %d lines' % line_ctr
+
    part = line_ctr / parts
    begin = 0
    end = 0
    part = line_ctr / parts
    begin = 0
    end = 0
@@ -166,9 +169,12 @@ def split_file_join_results(filename,parts):
          end = begin+part
 
       params = (begin,end)
          end = begin+part
 
       params = (begin,end)
+      print begin,end
 
       all_intervals.append(params)
 
 
       all_intervals.append(params)
 
+   pdb.set_trace()
+
    infile = open(filename,'r')
 
    parts_fn = []
    infile = open(filename,'r')
 
    parts_fn = []
@@ -199,4 +205,5 @@ def split_file_join_results(filename,parts):
    out_fh.close()
 
 
    out_fh.close()
 
 
-
+if __name__ == '__main__':
+   split_file_join_results('/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map.vm',10)
index 5da7bb2..9e97190 100644 (file)
@@ -7,18 +7,23 @@ import pdb
 import os
 import os.path
 import math
 import os
 import os.path
 import math
-from qpalma.parsers import *
+import numpy
+
+#from qpalma.parsers import *
 
 from Evaluation import load_chunks
 
 from Evaluation import load_chunks
+import qparser
+
 
 
-def prediction_on(current_dir,filtered_reads,out_fname):
-    
-   import qparser
-   qparser.parse_reads(filtered_reads)
+def create_aglignment_file(allPredictions,out_fname):
+   import numpy
+   #qparser.parse_reads(filtered_reads)
 
    out_fh = open(out_fname,'w+')
 
 
    out_fh = open(out_fname,'w+')
 
-   allPredictions = load_chunks(current_dir)
+   #allPredictionsmllPredictions = load_chunks(current_loc)
+   #allPredictions = cPickle.load(open(current_loc))
+
    allPositions = {}
 
    for current_prediction in allPredictions:
    allPositions = {}
 
    for current_prediction in allPredictions:
@@ -28,12 +33,14 @@ def prediction_on(current_dir,filtered_reads,out_fname):
       #true_cut = current_ground_truth['true_cut']
       #seq = current_ground_truth['seq']
       #q1 = current_ground_truth['prb']
       #true_cut = current_ground_truth['true_cut']
       #seq = current_ground_truth['seq']
       #q1 = current_ground_truth['prb']
+
       seq         = current_prediction['est']
       seq         = current_prediction['est']
+      dna         = current_prediction['dna']
 
       # CHECK !!!
       q1          = 'zzz'
 
 
       # CHECK !!!
       q1          = 'zzz'
 
-      chr         = current_prediction['chr']
+      chromo      = current_prediction['chr']
       strand      = current_prediction['strand']
       start_pos   = current_prediction['start_pos']
       alignment   = current_prediction['alignment']
       strand      = current_prediction['strand']
       start_pos   = current_prediction['start_pos']
       alignment   = current_prediction['alignment']
@@ -41,21 +48,80 @@ def prediction_on(current_dir,filtered_reads,out_fname):
       predExons = current_prediction['predExons']
       predExons = [e+start_pos for e in predExons]
 
       predExons = current_prediction['predExons']
       predExons = [e+start_pos for e in predExons]
 
-      p_start  = current_ground_truth['p_start']
-      e_stop   = current_ground_truth['exon_stop']
-      e_start  = current_ground_truth['exon_start']
-      p_stop   = current_ground_truth['p_stop']
+      #p_start  = current_ground_truth['p_start']
+      #e_stop   = current_ground_truth['exon_stop']
+      #e_start  = current_ground_truth['exon_start']
+      #p_stop   = current_ground_truth['p_stop']
+
+      # &match, &mismatch, &repmatch, &Ns, &Qgapcnt, &Qgapbs, &Tgapcnt, &Tgapbs,                                                                                                                                              
+      predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
+      #predExons = predExons.T
+
+      #print predExons
+      #pdb.set_trace()
+
+      match = predExons[0,1] - predExons[0,0] 
+      if predExons.shape == (2,2):
+         match += predExons[1,1] - predExons[1,0] 
+
+      mismatch = 0
+      repmatch = 0
+      Ns = 0
+      Qgapcnt = 0
+      Qgapbs = 0
+
+      Tgapcnt = 0
+      Tgapbs = 0
 
 
-      correct_prediction = False
+      if predExons.shape == (2,2):
+         Tgapbs += predExons[1,0] - predExons[0,1]
+         Tgapcnt = 1
+
+      # &strand, Qname, &Qsize, 
+      
+      # &Qstart, &Qend, Tname, &Tsize,
+      # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
 
       (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
       tExonSizes,tStarts, tEnds) = alignment 
 
 
       (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
       tExonSizes,tStarts, tEnds) = alignment 
 
-      new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
-      (id,chr,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
-      str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
-      str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
-      str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
+      #
+      # ATTENTION
+      #  
+      # we enforce equal exons sizes for q and t because we are missing indel
+      # positions of the alignment
+
+      if qExonSizes[0] != tExonSizes[0]:
+         continue
+
+      Qname = str(id)
+      Qsize = len(seq)
+      Qstart = qStart
+      Qend = qEnd
+      Tname = 'CHR%d'%chromo
+
+      start_pos -= 2
+
+      Tsize = tEnd+1 + start_pos
+      Tstart =  tStart + start_pos
+      Tend = tEnd + start_pos
+      blockcnt = Tgapcnt+1
+      exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
+      Qstarts_str = str(qStarts)[1:-1].replace(' ','')
+      Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
+
+      #new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
+      #(id,chr,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
+      #str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
+      #str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
+      #str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
+
+      new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
+      % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
+      Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
+      Tname, Tsize, Tstart, Tend,\
+      blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
+
 
       out_fh.write(new_line)
 
 
       out_fh.write(new_line)
 
@@ -63,6 +129,5 @@ def prediction_on(current_dir,filtered_reads,out_fname):
 
 if __name__ == '__main__':
    current_dir = sys.argv[1]
 
 if __name__ == '__main__':
    current_dir = sys.argv[1]
-   filtered_reads = sys.argv[2]
-   out_fh = sys.argv[3]
-   prediction_on(current_dir,filtered_reads,out_fh)
+   out_fh = sys.argv[2]
+   create_aglignment_file(current_dir,out_fh)
index 9c211c7..64381d1 100644 (file)
@@ -40,7 +40,7 @@ def create_and_submit():
       data_fname     = jp(data_dir,'map.vm.part_%d'%idx)
       result_fname   = jp(data_dir,'map.vm.part_%d.heuristic'%idx)
 
       data_fname     = jp(data_dir,'map.vm.part_%d'%idx)
       result_fname   = jp(data_dir,'map.vm.part_%d.heuristic'%idx)
 
-      current_job = KybJob(grid_predict.g_heuristic,[run_fname,data_fname,param_fname,result_fname])
+      current_job = KybJob(grid_heuristic.g_heuristic,[run_fname,data_fname,param_fname,result_fname])
       current_job.h_vmem = '25.0G'
       #current_job.express = 'True'
 
       current_job.h_vmem = '25.0G'
       #current_job.express = 'True'
 
index 54ed50a..2afeb7c 100644 (file)
@@ -45,7 +45,7 @@ def makeJobs(run,dataset_fn,chunks,param):
 
    for c_name,current_chunk in chunks:
       current_job = KybJob(grid_predict.g_predict,[run,dataset_fn,current_chunk,param,c_name])
 
    for c_name,current_chunk in chunks:
       current_job = KybJob(grid_predict.g_predict,[run,dataset_fn,current_chunk,param,c_name])
-      current_job.h_vmem = '5.0G'
+      current_job.h_vmem = '12.0G'
       #current_job.express = 'True'
 
       print "job #1: ", current_job.nativeSpecification
       #current_job.express = 'True'
 
       print "job #1: ", current_job.nativeSpecification
@@ -67,14 +67,14 @@ def create_and_submit():
    run   = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
    param = cPickle.load(open(jp(run_dir,'param_526.pickle')))
 
    run   = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
    param = cPickle.load(open(jp(run_dir,'param_526.pickle')))
 
-   dataset_fn           = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_12_05_08.test.pickle'
-   prediction_keys_fn   = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_12_05_08.test_keys.pickle'
+   dataset_fn           = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/dataset_transcriptome_run_1.pickle'
+   prediction_keys_fn   = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/dataset_transcriptome_run_1.keys.pickle'
 
    prediction_keys = cPickle.load(open(prediction_keys_fn))
 
    print 'Found %d keys for prediction.' % len(prediction_keys)
 
 
    prediction_keys = cPickle.load(open(prediction_keys_fn))
 
    print 'Found %d keys for prediction.' % len(prediction_keys)
 
-   num_splits = 15
+   num_splits = 25
    slices = get_slices(len(prediction_keys),num_splits)
    chunks = []
    for idx,slice in enumerate(slices):
    slices = get_slices(len(prediction_keys),num_splits)
    chunks = []
    for idx,slice in enumerate(slices):
index d32a2e7..e71f623 100644 (file)
@@ -20,7 +20,7 @@ import pdb
 import re
 import os.path
 
 import re
 import os.path
 
-from qpalma.sequence_utils import getSpliceScores, get_seq_and_scores
+from qpalma.sequence_utils import *
 
 import numpy
 from numpy.matlib import mat,zeros,ones,inf
 
 import numpy
 from numpy.matlib import mat,zeros,ones,inf
@@ -53,24 +53,6 @@ class SpliceSiteException:
    pass
 
 
    pass
 
 
-def unbracket_est(est):
-   new_est = ''
-   e = 0
-
-   while True:
-      if e >= len(est):
-         break
-
-      if est[e] == '[':
-         new_est += est[e+2]
-         e += 4
-      else:
-         new_est += est[e]
-         e += 1
-
-   return "".join(new_est).lower()
-
-
 def getData(training_set,exampleKey,run):
    currentSeqInfo,currentExons,original_est,currentQualities = training_set[exampleKey]
    id,chr,strand,up_cut,down_cut = currentSeqInfo
 def getData(training_set,exampleKey,run):
    currentSeqInfo,currentExons,original_est,currentQualities = training_set[exampleKey]
    id,chr,strand,up_cut,down_cut = currentSeqInfo
@@ -734,7 +716,7 @@ class QPalma:
       if '-' in est:
          self.plog('found gap\n')
          est = est.replace('-','')
       if '-' in est:
          self.plog('found gap\n')
          est = est.replace('-','')
-         assert len(est) == 36
+         assert len(est) == Conf.read_size
 
       dna_len = len(dna)
       est_len = len(est)
 
       dna_len = len(dna)
       est_len = len(est)
index 53e5803..e855835 100755 (executable)
@@ -1,6 +1,6 @@
 #!/bin/bash
 
 run_dir=/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+
 #!/bin/bash
 
 run_dir=/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+
-data_dir=/fml/ag-raetsch/share/projects/qpalma/solexa/new_run
+data_dir=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data
 
 
-python PipelineHeuristic.py $data_dir/map.vm $data_dir/allReads.pipeline $run_dir/param_526.pickle $run_dir/run_obj.pickle $data_dir/spliced.heuristic $data_dir/unspliced.heuristic 
+python PipelineHeuristic.py $data_dir/map.vm $run_dir/param_526.pickle $run_dir/run_obj.pickle $data_dir/spliced.heuristic $data_dir/unspliced.heuristic