+ changed PipelineHeuristic to support new data access functions
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 31 Jul 2008 09:22:34 +0000 (09:22 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 31 Jul 2008 09:22:34 +0000 (09:22 +0000)
+ fixed bug in the split_file utility function
+ minor changes in the grid_heuristic.py file

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@10267 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/PipelineHeuristic.py
scripts/Utils.py
scripts/grid_heuristic.py

index 5f7a2f6..93d6265 100644 (file)
@@ -6,7 +6,6 @@ import sys
 import pdb
 import os
 import os.path
-import math
 import resource
 
 from qpalma.computeSpliceWeights import *
@@ -20,18 +19,18 @@ from qpalma.parsers import *
 
 from ParaParser import *
 
-from Lookup import Lookup
+from qpalma.Lookup import LookupTable
 
 from qpalma.sequence_utils import reverse_complement,unbracket_seq
 
 
-
 class BracketWrapper:
-   fields = ['id', 'chr', 'pos', 'strand', 'mismatches', 'length',\
-   'offset', 'seq', 'prb', 'cal_prb', 'chastity']
+   #fields = ['id', 'chr', 'pos', 'strand', 'mismatches', 'length',\
+   #'offset', 'seq', 'prb', 'cal_prb', 'chastity']
+   fields = ['id', 'chr', 'pos', 'strand', 'seq', 'prb']
 
    def __init__(self,filename):
-      self.parser = ParaParser("%lu%d%d%s%d%d%d%s%s%s%s",self.fields,len(self.fields),IN_VECTOR)
+      self.parser = ParaParser("%lu%d%d%s%s%s",self.fields,len(self.fields),IN_VECTOR)
       self.parser.parseFile(filename)
 
    def __len__(self):
@@ -68,23 +67,36 @@ class PipelineHeuristic:
       run = cPickle.load(open(run_fname))
       self.run = run
 
-      dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-
-
       start = cpu()
-
       # old version
       #self.all_remapped_reads = parse_map_vm_heuristic(data_fname)
       self.all_remapped_reads = BracketWrapper(data_fname)
-      
       stop = cpu()
 
       print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
 
+      #g_dir    = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
+      #acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
+      #don_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
+
+      #g_fmt = 'contig%d.dna.flat'
+      #s_fmt = 'contig_%d%s'
+
+      #self.chromo_range = range(0,1099)
+
+      g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+      g_fmt = 'chr%d.dna.flat'
+      s_fmt = 'contig_%d%s'
+
+      self.chromo_range = range(1,6)
 
       start = cpu()
-      self.lt1 = Lookup(dna_flat_files)
+      self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,self.chromo_range)
       stop = cpu()
+
       print 'prefetched sequence and splice data in %f sec' % (stop-start)
 
       self.result_spliced_fh = open('%s.spliced'%result_fname,'w+')
@@ -105,7 +117,7 @@ class PipelineHeuristic:
       self.mmatrix = mmatrix
       self.qualityPlifs = qualityPlifs
 
-      self.read_size    = 36
+      self.read_size    = 38
 
       # parameters of the heuristics to decide whether the read is spliced
       #self.splice_thresh      = 0.005
@@ -212,13 +224,13 @@ class PipelineHeuristic:
          chr      = location['chr']
          pos      = location['pos']
          strand   = location['strand']
-         mismatch = location['mismatches']
-         length   = location['length']
-         off      = location['offset']
+         #mismatch = location['mismatches']
+         #length   = location['length']
+         #off      = location['offset']
          seq      = location['seq']
          prb      = location['prb']
-         cal_prb  = location['cal_prb']
-         chastity = location['chastity']
+         #cal_prb  = location['cal_prb']
+         #chastity = location['chastity']
 
          id = int(id)
 
@@ -228,14 +240,14 @@ class PipelineHeuristic:
 
          strand = strand_map[strand]
 
-         if not chr in range(1,6):
+         if not chr in self.chromo_range:
             continue
 
          unb_seq = unbracket_seq(seq)
 
          # forgot to do this
          if strand == '-':
-            pos = self.lt1.seqInfo.chromo_sizes[chr+7]-pos-self.read_size
+            pos = self.lt1.seqInfo.chromo_sizes[chr]-pos-self.read_size
             unb_seq = reverse_complement(unb_seq)
 
          effective_len = len(unb_seq)
@@ -256,7 +268,7 @@ class PipelineHeuristic:
          exons[1,0]     = effective_len
          est            = unb_seq
          original_est   = seq
-         quality        = prb
+         quality        = map(lambda x:ord(x)-50,prb)
 
          #pdb.set_trace()
 
@@ -386,7 +398,8 @@ class PipelineHeuristic:
       strand   = location['strand']
       original_est = location['seq']
       quality      = location['prb']
-      cal_prb  = location['cal_prb']
+      quality        = map(lambda x:ord(x)-50,quality)
+      #cal_prb  = location['cal_prb']
       
       original_est = original_est.lower()
       est = unbracket_seq(original_est)
index ea4fb26..7a54ed5 100644 (file)
@@ -148,7 +148,11 @@ def get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
 
 ##########
 
-def split_file(filename,result_dir,parts):
+def split_file(filename,result_dir,num_parts):
+   """
+   Splits a file of n lines into num_parts parts
+   """
+
    jp = os.path.join
 
    all_intervals = []
@@ -160,20 +164,19 @@ def split_file(filename,result_dir,parts):
 
    print 'found %d lines' % line_ctr
 
-   part = line_ctr / parts
+   part_size = line_ctr / num_parts
    begin = 0
    end = 0
-   for idx in range(1,parts+1):
-      
-      if idx == parts:
-         begin = end
-         end   = line_ctr
+
+   for idx in range(1,num_parts+1):
+      begin = end
+
+      if idx == num_parts:
+         end = line_ctr
       else:
-         begin = end
-         end = begin+part
+         end = begin+part_size+1
 
-      params = (begin,end)
-      all_intervals.append(params)
+      all_intervals.append((begin,end))
 
    parts_fn = []
    for pos,params in enumerate(all_intervals):
@@ -189,19 +192,26 @@ def split_file(filename,result_dir,parts):
    lineCtr = 0
    beg = -1
    end = -1
-   for line in open(filename,'r'):
+   in_fh = open(filename,'r')
+   while True:
+      line = in_fh.readline()
+      if line == '':
+         break
+
       if beg <= lineCtr < end:
          out_fh.write(line)
          lineCtr += 1
       else:
-         params = all_intervals.pop()
-         beg,end = params
+         (beg,end) = all_intervals.pop()
          out_fn = parts_fn.pop()
          out_fh.close()
          out_fh = open(out_fn,'w+')
+         out_fh.write(line)
 
    out_fh.close()
 
 
 if __name__ == '__main__':
-   split_file_join_results('/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map.vm',10)
+   split_file('/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm','/tmp',25)
+   
+
index 78f9bd6..424a61f 100644 (file)
@@ -30,18 +30,26 @@ def g_heuristic(run_fname,data_fname,param_fname,result_fname):
 def create_and_submit():
    jp = os.path.join
 
+   num_splits = 25
+
    run_dir  = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
-   data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_0'
+   #data_dir = '/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/'
+
+   data_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
+
 
    run_fname      = jp(run_dir,'run_obj.pickle')
-   original_map_fname = '/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_44/4/length_38/main/map.vm'
-   #split_file(original_map_fname,data_dir,25)
+
+   #original_map_fname = '/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm'
+   #original_map_fname = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/new_map.vm'
+   original_map_fname = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main/map.vm'
+   split_file(original_map_fname,data_dir,num_splits)
 
    param_fname    = jp(run_dir,'param_526.pickle')
 
    functionJobs=[]
 
-   for idx in range(3,25):
+   for idx in range(0,num_splits):
       data_fname     = jp(data_dir,'map.part_%d'%idx)
       result_fname   = jp(data_dir,'map.vm.part_%d.heuristic'%idx)
 
@@ -49,11 +57,12 @@ def create_and_submit():
 
       current_job = KybJob(grid_heuristic.g_heuristic,[run_fname,data_fname,param_fname,result_fname])
       current_job.h_vmem = '25.0G'
-      current_job.express = 'True'
+      #current_job.express = 'True'
 
       print "job #1: ", current_job.nativeSpecification
 
       functionJobs.append(current_job)
+      #break
 
    (sid, jobids) = submit_jobs(functionJobs)
    #print 'checking whether finished'