+ changed dataset compilation to support new framework
author	Fabio <fabio@congo.fml.local>
Wed, 10 Sep 2008 21:28:08 +0000 (23:28 +0200)
committer	Fabio <fabio@congo.fml.local>
Wed, 10 Sep 2008 21:28:08 +0000 (23:28 +0200)
qpalma/DatasetUtils.py [new file with mode: 0644]
qpalma/gridtools.py
qpalma/sequence_utils.py
scripts/PipelineHeuristic.py

diff --git a/qpalma/DatasetUtils.py b/qpalma/DatasetUtils.py
new file mode 100644 (file)
index 0000000..09d241c
--- /dev/null
@@ -0,0 +1,135 @@
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
+import array
+import cPickle
+import os
+import pdb
+
+from os.path import join as jp
+
+#from parsers import *
+from sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
+
+
+def generateDataset(global_settings,first_round=True):
+   """
+   Generate the dataset for the matches of the second vmatch run and the
+   heuristic-filtered matches of the first run.
+
+   An example in our dataset consists of:
+
+   - information on the original read:
+         * id
+         * nucleotide sequence
+         * quality vectors (at the moment: prb and chastity)
+
+   - information on the target dna sequence:
+         * chromosome
+         * strand
+         * fragment start
+         * fragment end positions
+
+   This information is stored in tuples which are then stored in a dictionary
+   indexed by the read's unique id.
+
+   CAUTION: The dictionary stores a list of the above defined tuples under each
+   'id'. This is because one read can be found several times on the genome.
+   """
+
+   # NOTE(review): map_file / map_2nd_file were undefined free variables in
+   # the original (copied from a method); taken from the settings dictionary
+   # here -- confirm the key names against the settings parser.
+   map_file     = global_settings['map_file']
+   map_2nd_file = global_settings['map_2nd_file']
+
+   assert os.path.exists(map_file), 'Error: Can not find map file'
+   assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
+
+   dataset = {}
+
+   # quality values arrive as ASCII characters; store (ord(char) - prb_offset)
+   prb_offset = 64
+
+   # this means that around each vmatch hit we cut out a window of 1500 bases
+   # each up-/downstream.
+   half_window_size = 1500
+
+   instance_counter = 0
+
+   accessWrapper = DataAccessWrapper(global_settings)
+   seqInfo = SeqSpliceInfo(accessWrapper,global_settings['allowed_fragments'])
+
+   for line in open(map_file):
+      if instance_counter > 0 and instance_counter % 5000 == 0:
+         print 'processed %d examples' % instance_counter
+
+      slist    = line.split()
+
+      id       = int(slist[0])
+      chromo   = int(slist[1])
+      pos      = int(slist[2])
+
+      # for the time being we only consider chromosomes 1-5
+      if not chromo in range(1,6):
+         continue
+
+      # convert D/P strand info to +/-
+      strand = ['-','+'][slist[3] == 'D']
+
+      # QPalma uses lowercase characters
+      bracket_seq = slist[4].lower()
+      read_seq = unbracket_seq(bracket_seq)
+      reconstructed_dna_seq = reconstruct_dna_seq(bracket_seq)
+
+      # reads from the first round are stored w.r.t. the forward strand
+      if strand == '-' and first_round:
+         read_seq = reverse_complement(read_seq)
+
+      assert not '[' in read_seq and not ']' in read_seq
+
+      # we use an area of +/- `half_window_size` nucleotides around the seed
+      # position, clipped at the chromosome boundaries
+      if pos > half_window_size+1:
+         us_offset = half_window_size
+      else:
+         us_offset = pos - 1
+
+      if pos+half_window_size < seqInfo.chromo_sizes[chromo]:
+         ds_offset = half_window_size
+      else:
+         ds_offset = seqInfo.chromo_sizes[chromo]-pos-1
+
+      genomicSeq_start = pos - us_offset
+      genomicSeq_stop  = pos + ds_offset
+
+      dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
+
+      # In order to save some space we use a signed char to store the
+      # qualities. Each quality element can range as follows: -128 <= elem <= 127
+      prb = array.array('b',map(lambda x: ord(x)-prb_offset,slist[5]))
+
+      currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
+      currentQualities = [prb]
+
+      # As one single read can have several vmatch matches we store all these
+      # matches under the unique id of the read
+      dataset.setdefault(id, []).append((currentSeqInfo,read_seq,currentQualities))
+
+      instance_counter += 1
+
+   print 'Total examples processed: %d' % instance_counter
+
+   ddir = global_settings['dataset_dir']
+   dataset_fn        = jp(ddir,'data.pickle')
+   dataset_keys_fn   = jp(ddir,'data.keys.pickle')
+
+   assert not os.path.exists(dataset_fn), 'The data_file already exists!'
+   assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'
+
+   # saving new dataset and single keys as well
+   cPickle.dump(dataset,open(dataset_fn,'w+'),protocol=2)
+   cPickle.dump(dataset.keys(),open(dataset_keys_fn,'w+'),protocol=2)
index a6b73d6..2733390 100644 (file)
@@ -132,7 +132,7 @@ class ApproximationTask(ClusterTask):
          data_fname     = jp(result_dir,'map.part_%d'%idx)
          result_fname   = jp(result_dir,'map.vm.part_%d.heuristic'%idx)
 
-         current_job = KybJob(gridtools.ApproximationTaskStarter,[run_fname,data_fname,param_fname,result_fname])
+         current_job = KybJob(gridtools.ApproximationTaskStarter,[run_fname,data_fname,param_fname,result_fname,self.global_settings])
          current_job.h_vmem = '25.0G'
          #current_job.express = 'True'
 
@@ -141,8 +141,8 @@ class ApproximationTask(ClusterTask):
          self.functionJobs.append(current_job)
 
 
-def ApproximationTaskStarter(run_fname,data_fname,param_fname,result_fname):
-   ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_fname)
+def ApproximationTaskStarter(run_fname,data_fname,param_fname,result_fname,settings):
+   ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_fname,settings)
    ph1.filter()
 
    return 'finished filtering set %s.' % data_fname
index 7c3bef4..54bc0d0 100644 (file)
@@ -196,18 +196,14 @@ class DataAccessWrapper:
    """
    The purpose of this class is to wrap the genomic/splice site score data
    access.
-
-   
-
    """
 
-
-   def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt):
-      self.genomic_dir        = genomic_data_dir
-      self.acc_score_dir      = acc_score_dir
-      self.don_score_dir      = don_score_dir
-      self.genomic_fmt        = gen_fmt
-      self.sscore_fmt         = sscore_fmt
+   def __init__(self,settings):
+      self.genomic_dir        = settings['genome_dir'] 
+      self.acc_score_dir      = settings['acceptor_scores_dir'] 
+      self.don_score_dir      = settings['donor_scores__dir'] 
+      self.genomic_fmt        = settings['genome_file_fmt'] 
+      self.sscore_fmt         = settings['splice_score_file_fmt'] 
 
       assert os.path.isdir(genomic_data_dir)
       assert os.path.isdir(acc_score_dir)
index f4050bc..472c0e7 100644 (file)
@@ -58,7 +58,7 @@ class PipelineHeuristic:
    vmatch is spliced an should be then newly aligned using QPalma or not.
    """
 
-   def __init__(self,run_fname,data_fname,param_fname,result_fname):
+   def __init__(self,run_fname,data_fname,param_fname,result_fname,settings):
       """
       We need a run object holding information about the nr. of support points
       etc.
@@ -73,28 +73,10 @@ class PipelineHeuristic:
 
       print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
 
-      #g_dir    = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
-      #acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
-      #don_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
+      self.chromo_range = settings['allowed_fragments']
 
-      #g_fmt = 'contig%d.dna.flat'
-      #s_fmt = 'contig_%d%s'
-
-      #self.chromo_range = range(0,1099)
-
-      g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
-      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
-
-      g_fmt = 'chr%d.dna.flat'
-      s_fmt = 'contig_%d%s'
-
-      self.chromo_range = range(1,6)
-      #self.chromo_range = range(1,2)
-
-      num_chromo = 2
-      accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
-      self.seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+      accessWrapper = DataAccessWrapper(settings)
+      seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
 
       start = cpu()
       self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,self.chromo_range)