+ update makefiles to fetch automatically valid Python includes and libs
[qpalma.git] / qpalma / DatasetUtils.py
index 6c41062..d16eb3c 100644 (file)
@@ -8,6 +8,7 @@
 
 import array
 import cPickle
+import numpy
 import os
 import os.path
 import pdb
@@ -70,16 +71,20 @@ def generateTrainingDataset(settings):
 
    """
 
-   dataset = []
+   dataset = {}
 
    half_window_size = settings['half_window_size']
 
+   # This tuple specifies an interval for valid Illumina Genome Analyzer quality values
+   if settings['platform'] == 'IGA':
+      quality_interval = illumina_ga_range
+
    instance_counter = 0
 
    accessWrapper = DataAccessWrapper(settings)
    seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
 
-   for line in open(map_file):
+   for line in open(settings['training_reads_fn']):
       line = line.strip()
       if line.startswith('#') or line == '':
          continue
@@ -97,13 +102,21 @@ def generateTrainingDataset(settings):
          continue
 
       # convert D/P strand info to +/-
-      strand = ['-','+'][slist[2] == 'D']
+      strand = slist[2]
+
+      if strand in ['-','+']:
+         pass
+      elif strand in ['D','P']:
+         strand = ['-','+'][strand == 'D']
+      else:
+         print 'Error strand information has to be either +/- or D/P'
+         sys.exit(1)
 
       seqBeginning   = int(slist[3])
       seqEnd         = int(slist[4])
 
       readAlignment  = slist[5]
-      prb = processQuality(slist[6],prb_offset,quality_intervals,perform_checks)
+      prb = processQuality(slist[6],settings['prb_offset'],quality_interval,settings['perform_checks'])
       
       exons = numpy.mat([0,0,0,0]).reshape((2,2))
 
@@ -112,14 +125,16 @@ def generateTrainingDataset(settings):
       exons[1,0]     = int(slist[9])
       exons[1,1]     = int(slist[10])
 
-      dna = seqInfo.get_seq_and_scores(chr,strand,up_cut,down_cut,only_seq=True)
-      relative_exons = exons - up_cut
+      dna = seqInfo.get_seq_and_scores(chromo,strand,seqBeginning,seqEnd,only_seq=True)
+      relative_exons = exons - seqBeginning
 
       assert checkExons(dna,relative_exons,readAlignment,id)
 
-      currentSeqInfo = (id,chromo)
+      currentSeqInfo = (id,chromo,strand,seqBeginning,seqEnd)
+
+      dataset[id] = (currentSeqInfo,readAlignment,[prb],exons)
       
-      dataset.setdefault(id, []).append((currentSeqInfo,readAlignment,[prb],exons))
+      # dataset.setdefault(id, []).append()
 
    saveData('training',dataset,settings)
 
@@ -127,10 +142,6 @@ def generateTrainingDataset(settings):
 def addToDataset(map_file,dataset,settings):
    assert os.path.exists(map_file), 'Error: Can not find map file'
 
-   perform_checks = settings['perform_checks']
-
-   prb_offset = settings['prb_offset']
-
    # This tuple specifies an interval for valid Illumina Genome Analyzer quality values
    if settings['platform'] == 'IGA':
       quality_interval = illumina_ga_range
@@ -188,7 +199,7 @@ def addToDataset(map_file,dataset,settings):
 
       dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
       
-      prb = processQuality(slist[5],prb_offset,quality_interval,perform_checks)
+      prb = processQuality(slist[5],settings['prb_offset'],quality_interval,settings['perform_checks'])
    
       currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)