+ set proper logging in optimizer code
[qpalma.git] / qpalma / DatasetUtils.py
index 669d619..6c41062 100644 (file)
@@ -12,7 +12,7 @@ import os
 import os.path
 import pdb
 
-from sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
+from sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
 
 jp = os.path.join
 
@@ -20,18 +20,115 @@ illumina_ga_range = (-5,40)
 #roche_454_range   =
 
 
+def processQuality(raw_qualities,prb_offset,quality_interval,perform_checks):
+   """
+   In order to save some space we use a signed char to store the
+   qualities. Each quality element can range as follows: -128 <= elem <= 127
+   """
+
+   q_values = map(lambda x: ord(x)-prb_offset,raw_qualities)
+
+   if perform_checks:
+      for entry in q_values:
+         assert quality_interval[0] <= entry <= quality_interval[1], 'Error: Your read has invalid quality values: %d' % entry
+
+   return array.array('b',q_values)
+
+
+def checkExons(dna,exons,readAlignment,exampleKey):
+   """
+    
+   """
+
+   fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
+  
+   donor_elem = dna[exons[0,1]:exons[0,1]+2]
+   acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
+
+   if not ( donor_elem == 'gt' or donor_elem == 'gc' ):
+      print 'invalid donor in example %d'% exampleKey
+      return False
+
+   if not ( acceptor_elem == 'ag' ):
+      print 'invalid acceptor in example %d'% exampleKey
+      return False
+
+   read = unbracket_seq(readAlignment)
+   read = read.replace('-','')
+
+   assert len(fetched_dna_subseq) == len(read), pdb.set_trace()
+
+   return True
+
+
 def generateTrainingDataset(settings):
    """
    This function creates a training dataset.
+
+
+   Create lower case original ("bracketed") reads
+
    """
+
    dataset = []
 
+   half_window_size = settings['half_window_size']
+
+   instance_counter = 0
+
+   accessWrapper = DataAccessWrapper(settings)
+   seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
+
+   for line in open(map_file):
+      line = line.strip()
+      if line.startswith('#') or line == '':
+         continue
+
+      if instance_counter > 0 and instance_counter % 5000 == 0:
+         print 'processed %d examples' % instance_counter
+
+      slist    = line.split()
+
+      id       = int(slist[0])
+      chromo   = int(slist[1])
+
+      # for the time being we only consider chromosomes 1-5
+      if not chromo in settings['allowed_fragments']:
+         continue
+
+      # convert D/P strand info to +/-
+      strand = ['-','+'][slist[2] == 'D']
+
+      seqBeginning   = int(slist[3])
+      seqEnd         = int(slist[4])
+
+      readAlignment  = slist[5]
+      prb = processQuality(slist[6],prb_offset,quality_intervals,perform_checks)
+      
+      exons = numpy.mat([0,0,0,0]).reshape((2,2))
+
+      exons[0,0]     = int(slist[7])
+      exons[0,1]     = int(slist[8])
+      exons[1,0]     = int(slist[9])
+      exons[1,1]     = int(slist[10])
+
+      dna = seqInfo.get_seq_and_scores(chr,strand,up_cut,down_cut,only_seq=True)
+      relative_exons = exons - up_cut
+
+      assert checkExons(dna,relative_exons,readAlignment,id)
+
+      currentSeqInfo = (id,chromo)
+      
+      dataset.setdefault(id, []).append((currentSeqInfo,readAlignment,[prb],exons))
+
    saveData('training',dataset,settings)
 
 
 def addToDataset(map_file,dataset,settings):
    assert os.path.exists(map_file), 'Error: Can not find map file'
 
+   perform_checks = settings['perform_checks']
+
    prb_offset = settings['prb_offset']
 
    # This tuple specifies an interval for valid Illumina Genome Analyzer quality values
@@ -48,7 +145,8 @@ def addToDataset(map_file,dataset,settings):
    seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
 
    for line in open(map_file):
-      if line.startswith('#'):
+      line = line.strip()
+      if line.startswith('#') or line == '':
          continue
 
       if instance_counter > 0 and instance_counter % 5000 == 0:
@@ -83,30 +181,20 @@ def addToDataset(map_file,dataset,settings):
       if pos+half_window_size < seqInfo.getFragmentSize(chromo):
          ds_offset = half_window_size
       else:
-         ds_offset = seqInfo.chromo_sizes[chromo]-pos-1
+         ds_offset = seqInfo.getFragmentSize(chromo)-pos-1
          
       genomicSeq_start = pos - us_offset
       genomicSeq_stop  = pos + ds_offset
 
       dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
       
-      # In order to save some space we use a signed char to store the
-      # qualities. Each quality element can range as follows: -128 <= elem <= 127
-      
-      q_values = map(lambda x: ord(x)-prb_offset,slist[5])
-
-      if settings['perform_checks']:
-         for entry in q_values:
-            assert quality_interval[0] <= entry <= quality_interval[1], 'Error: Your read has invalid quality values: %d' % entry
-
-      prb = array.array('b',q_values)
-
+      prb = processQuality(slist[5],prb_offset,quality_interval,perform_checks)
+   
       currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
-      currentQualities = [prb]
 
       # As one single read can have several vmatch matches we store all these
       # matches under the unique id of the read
-      dataset.setdefault(id, []).append((currentSeqInfo,read_seq,currentQualities))
+      dataset.setdefault(id, []).append((currentSeqInfo,read_seq,[prb]))
 
       instance_counter += 1