+ added basic scripts to generate runs for an experiment
[qpalma.git] / scripts / compile_dataset.py
index 0250742..cf79622 100644 (file)
@@ -21,6 +21,14 @@ from genome_utils import load_genomic
 
 import qpalma.Configuration as Conf
 
+warning = """
+   #########################################################
+   WARNING ! Disabled the use of non-bracket reads WARNING !
+   #########################################################
+   
+   """
+
+
 help = """
 
    Usage of this script is:
@@ -118,6 +126,7 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
          pdb.set_trace()
 
       if currentFRead['strand'] != '+':
+         #print 'wrong strand'
          continue
 
       seq, acc, don, exons, est, original_est, qual, spos = process_read(currentRReads,currentFRead,currentGene,dna_flat_files,test)
@@ -140,14 +149,13 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
       if instance_counter % 100 == 0:
          print 'processed %d examples' % instance_counter
 
-      if instance_counter == 1000:
+      if instance_counter == 20000:
          break
 
-
    print 'Full dataset has size %d' % len(Sequences)
 
    dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
-   'Exons':Exons, 'Ests':Ests, 'Qualities':Qualities,\
+   'Exons':Exons, 'Ests':Ests, 'OriginalEsts':OriginalEsts, 'Qualities':Qualities,\
    'SplitPositions':SplitPositions,'Ids':Ids}
 
    # saving new dataset
@@ -160,36 +168,47 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    by using a 
 
    """
-   nil = ('','','','','','','')
+   nil = ('','','','','','','','')
    extended_alphabet = ['-','a','c','g','t','n','[',']']
    alphabet = ['-','a','c','g','t','n']
 
    chr            = fRead['chr']
    strand         = fRead['strand']
    quality        = fRead['prb']
+   #quality        = fRead['cal_prb']
+   #quality        = fRead['chastity']
    spos           = fRead['splitpos']
    currentReadSeq = fRead['seq']
    currentReadSeq = currentReadSeq.lower()
 
+   #if currentReadSeq.find('[') == -1:
+   #   print warning
+   #   return nil
+
    originalReadSeq = fRead['seq']
 
    chrom             = 'chr%d' % chr
    sscore_filename   = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s' % (chr,strand)
 
    # use matlab-style functions to access dna sequence
-   currentDNASeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=True)
+   currentDNASeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=False)
    currentDNASeq = currentDNASeq.lower()
 
    assert currentDNASeq != '', 'load_genomic returned empty sequence!'
 
    seq_has_indels = False
    for elem in currentReadSeq:
-      assert elem in extended_alphabet, 'Invalid char (extended_alphabet) in read: \"%s\"' % elem
+      #assert elem in extended_alphabet, 'Invalid char (extended_alphabet) in read: \"%s\"' % elem
+      if not elem in extended_alphabet:
+         return nil
+
       if elem ==  ']':
          seq_has_indels = True
 
    for elem in currentDNASeq:
-      assert elem in alphabet, 'Invalid char in dna: \"%s\"' % elem
+      #assert elem in alphabet, 'Invalid char in dna: \"%s\"' % elem
+      if not elem in alphabet:
+         return nil
 
    if strand == '-':
       try:
@@ -198,9 +217,30 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
          print 'invalid char'
          return nil
 
+   currentDNASeq = 'a' + currentDNASeq
+
    exon_stop   = fRead['exon_stop']
    exon_start  = fRead['exon_start']
 
+   # assertions to check whether exon start/stop positions are next to donor
+   # resp. acceptor sites
+   exonEnd     = exon_stop - currentGene.start
+   exonBegin   = exon_start - currentGene.start 
+
+   current_don = currentDNASeq[exonEnd+1:exonEnd+3]
+   current_acc = currentDNASeq[exonBegin-2:exonBegin]
+   if not current_don == 'gt' or current_don == 'gc':
+      return nil
+
+   if not current_acc == 'ag':
+      return nil
+
+   #assert current_don == 'gt' or current_don == 'gc', pdb.set_trace()
+   #assert current_acc == 'ag', pdb.set_trace()
+
+   # assertions for don/acc sites held now we recalculate offset
+   exon_stop   = exon_stop - currentGene.start
+   exon_start  = exon_start - currentGene.start 
 
    if not seq_has_indels:
       p_start     = fRead['p_start']
@@ -212,58 +252,95 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
 
    else:
       # if we have indels we have to take care of the exons boundaries
-      #pdb.set_trace()
+      firstReadSeq   = currentReadSeq[:spos]
+      secondReadSeq  = currentReadSeq[spos:]
 
-      import re
-      dna_del  = re.compile('\[-(?P<base>[acgt])\]')
-      read_del = re.compile('\[[acgt](?P<base>-)\]')
-      
-      elem_ctr = 0
+      est_fragment = []
       r_idx = 0
       while True:
          if not r_idx < len(currentReadSeq):
             break
 
-         if elem_ctr == spos:
+         if currentReadSeq[r_idx] == '[':
+            est_fragment.append(currentReadSeq[r_idx+2])
+            r_idx += 4
+            continue
+
+         est_fragment.append(currentReadSeq[r_idx])
+         r_idx += 1
+
+      dna_fragment_1 = []
+      r_idx = 0
+      while True:
+         if not r_idx < len(firstReadSeq):
             break
 
-         if currentReadSeq[r_idx] == '[':
+         if firstReadSeq[r_idx] == '[':
+            dna_fragment_1.append(firstReadSeq[r_idx+1])
             r_idx += 4
-            elem_ctr += 1
+            continue
 
-         elem_ctr += 1
+         dna_fragment_1.append(firstReadSeq[r_idx])
          r_idx += 1
 
-      firstReadSeq   = currentReadSeq[:r_idx]
-      secondReadSeq  = currentReadSeq[r_idx:]
+      dna_fragment_2 = []
+      r_idx = 0
+      while True:
+         if not r_idx < len(secondReadSeq):
+            break
+
+         if secondReadSeq[r_idx] == '[':
+            dna_fragment_2.append(secondReadSeq[r_idx+1])
+            r_idx += 4
+            continue
 
-      firstReadSeqPruned = re.sub(dna_del,'\g<base>',firstReadSeq)
-      firstReadSeqPruned = re.sub(read_del,'\g<base>',firstReadSeqPruned)
+         dna_fragment_2.append(secondReadSeq[r_idx])
+         r_idx += 1
 
-      secondReadSeqPruned = re.sub(dna_del,'\g<base>',secondReadSeq)
-      secondReadSeqPruned = re.sub(read_del,'\g<base>',secondReadSeqPruned)
+      currentReadSeq = "".join(est_fragment)
+      dna_fragment_1 = "".join(dna_fragment_1)
+      dna_fragment_2 = "".join(dna_fragment_2)
 
-      numDNADel   = len(dna_del.findall(firstReadSeq))
-      numReadDel  = len(read_del.findall(firstReadSeq))
+      dna_fragment_1_size = len([p for p in dna_fragment_1 if p != '-'])
+      dna_fragment_2_size = len([p for p in dna_fragment_2 if p != '-'])
 
-      new_exon1_size = len(firstReadSeqPruned) + numReadDel - numDNADel
+      # new check the fragment of the annotation
+      new_exon1_size = dna_fragment_1_size
       p_start     = exon_stop - new_exon1_size + 1
 
-      numDNADel   = len(dna_del.findall(secondReadSeq))
-      numReadDel  = len(read_del.findall(secondReadSeq))
+      dna_annot_1 = currentDNASeq[p_start:exon_stop+1]
 
-      new_exon2_size = len(secondReadSeqPruned) + numReadDel - numDNADel
-      p_stop      = exon_start + new_exon2_size - 1 
+      # new check the fragment of the annotation
+      new_exon2_size = dna_fragment_2_size 
+      p_stop      = exon_start + new_exon2_size - 1
 
-      currentReadSeq = firstReadSeqPruned+secondReadSeqPruned
-      assert len(currentReadSeq) == Conf.read_size, 'Error with glued read'
+      dna_annot_2 = currentDNASeq[exon_start:p_stop+1]
 
-      #pdb.set_trace()
+      #assert dna_fragment_1.replace('-','') == dna_annot_1, pdb.set_trace()
+      #assert dna_fragment_2.replace('-','') == dna_annot_2, pdb.set_trace()
+
+      if not dna_fragment_1.replace('-','') == dna_annot_1:
+         #print 'genomic seq mismatch'
+         #print 'orignal read:\t %s ' % originalReadSeq
+         #print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
+         #print 'dna_fragment_1:\t %s' % dna_fragment_1
+         #print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
+         return nil
+
+      if not dna_fragment_2.replace('-','') == dna_annot_2:
+         #print 'genomic seq mismatch'
+         #print 'orignal read:\t %s ' % originalReadSeq
+         #print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
+         #print 'dna_fragment_2:\t %s' % dna_fragment_2
+         #print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
+         return nil
+
+      #print 'successful'
 
       for elem in currentReadSeq:
          assert elem in alphabet, 'Invalid char (alphabet) in read: \"%s\"' % elem
 
-   assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
+   #assert p_start < exon_stop < exon_start < p_stop, pdb.set_trace()
    assert p_stop - p_start >= Conf.read_size
 
    currentExons = zeros((2,2),dtype=numpy.int)
@@ -274,17 +351,14 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    currentExons[1,0] = exon_start
    currentExons[1,1] = p_stop
 
-   # make indices relative to start pos of the current gene
-   currentExons -= currentGene.start
-
    # determine cutting positions for sequences
    up_cut   = int(currentExons[0,0])
    down_cut = int(currentExons[1,1])
 
-   # if we perform testing we cut a much wider region as we want to detect
+# if we perform testing we cut a much wider region as we want to detect
    # how good we perform on a region
    import random
-   ADD_POS = random.randint(250,1000)
+   ADD_POS = random.randint(Conf.extension[0],Conf.extension[1])
 
    if test:
       up_cut = up_cut-ADD_POS
@@ -301,6 +375,7 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
 
    # ensure that index is correct
    currentExons[0,1] += 1
+   currentExons[1,1] += 1
 
    if test:
       currentExons -= up_cut
@@ -318,7 +393,8 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
          return nil
 
    except:
-      pdb.set_trace()
+      return nil
+      #pdb.set_trace()
 
    # now we want to use interval_query to get the predicted splice scores
    # trained on the TAIR sequence and annotation
@@ -381,19 +457,30 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
             continue
          don[position-1] = scores[i]
    except:
-      pdb.set_trace()
+      return nil
+      #pdb.set_trace()
 
    don = [-inf] + don[:-1]
 
-   assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
-   assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
+   #assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
+   #assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
+   #assert (len(currentDNASeq)) == len(acc), pdb.set_trace()
+   #assert (len(currentDNASeq)) == len(don), pdb.set_trace()
 
-   assert (len(currentDNASeq)) == len(acc), pdb.set_trace()
-   assert (len(currentDNASeq)) == len(don), pdb.set_trace()
 
-   acc = [-inf] + acc[:-1]
+   if not ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf]:
+      return nil
 
-   currentExons[1,1] += 1
+   if not gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf]:
+      return nil
+
+   if not (len(currentDNASeq)) == len(acc):
+      return nil
+
+   if not (len(currentDNASeq)) == len(don):
+      return nil
+
+   acc = [-inf] + acc[:-1]
 
    return currentDNASeq, acc, don, currentExons, currentReadSeq, originalReadSeq, quality, spos