+ added basic scripts to generate runs for an experiment
[qpalma.git] / scripts / compile_dataset.py
index aec5fc5..cf79622 100644 (file)
@@ -19,6 +19,16 @@ from Genefinding import *
 
 from genome_utils import load_genomic
 
+import qpalma.Configuration as Conf
+
+warning = """
+   #########################################################
+   WARNING ! Disabled the use of non-bracket reads WARNING !
+   #########################################################
+   
+   """
+
+
 help = """
 
    Usage of this script is:
@@ -90,6 +100,7 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    Donors = []
    Exons = []
    Ests = []
+   OriginalEsts = []
    Qualities = []
    SplitPositions = []
    Ids = []
@@ -98,6 +109,8 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    
    # Iterate over all remapped reads in order to generate for each read an
    # training / prediction example
+   instance_counter = 0
+
    for id,currentFRead in all_filtered_reads.items():
 
       try:
@@ -105,14 +118,18 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
       except:
          currentRReads = None
 
-      gene_id = currentFRead['gene_id']
-      chr = currentFRead['chr']
-      currentGene = allGenes[chr][gene_id]
+      try:
+         gene_id = currentFRead['gene_id']
+         chro = currentFRead['chr']
+         currentGene = allGenes[chro][gene_id]
+      except:
+         pdb.set_trace()
 
       if currentFRead['strand'] != '+':
+         #print 'wrong strand'
          continue
-   
-      seq, acc, don, exons, est, qual, spos = process_read(currentRReads,currentFRead,currentGene,dna_flat_files,test)
+
+      seq, acc, don, exons, est, original_est, qual, spos = process_read(currentRReads,currentFRead,currentGene,dna_flat_files,test)
       
       if seq == '':
          continue
@@ -122,12 +139,23 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
       Donors.append(don)
       Exons.append(exons)
       Ests.append(est)
+      OriginalEsts.append(original_est)
       Qualities.append(qual)
       SplitPositions.append(spos)
       Ids.append(id)
 
+      instance_counter += 1
+
+      if instance_counter % 100 == 0:
+         print 'processed %d examples' % instance_counter
+
+      if instance_counter == 20000:
+         break
+
+   print 'Full dataset has size %d' % len(Sequences)
+
    dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
-   'Exons':Exons, 'Ests':Ests, 'Qualities':Qualities,\
+   'Exons':Exons, 'Ests':Ests, 'OriginalEsts':OriginalEsts, 'Qualities':Qualities,\
    'SplitPositions':SplitPositions,'Ids':Ids}
 
    # saving new dataset
@@ -140,39 +168,180 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    by using a 
 
    """
-   nil = ('','','','','','','')
+   nil = ('','','','','','','','')
+   extended_alphabet = ['-','a','c','g','t','n','[',']']
+   alphabet = ['-','a','c','g','t','n']
+
+   chr            = fRead['chr']
+   strand         = fRead['strand']
+   quality        = fRead['prb']
+   #quality        = fRead['cal_prb']
+   #quality        = fRead['chastity']
+   spos           = fRead['splitpos']
+   currentReadSeq = fRead['seq']
+   currentReadSeq = currentReadSeq.lower()
 
-   chr = fRead['chr']
-   strand = fRead['strand']
+   #if currentReadSeq.find('[') == -1:
+   #   print warning
+   #   return nil
+
+   originalReadSeq = fRead['seq']
 
    chrom             = 'chr%d' % chr
    sscore_filename   = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s' % (chr,strand)
 
    # use matlab-style functions to access dna sequence
-   currentSeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=True)
-   currentSeq = currentSeq.lower()
+   currentDNASeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=False)
+   currentDNASeq = currentDNASeq.lower()
+
+   assert currentDNASeq != '', 'load_genomic returned empty sequence!'
+
+   seq_has_indels = False
+   for elem in currentReadSeq:
+      #assert elem in extended_alphabet, 'Invalid char (extended_alphabet) in read: \"%s\"' % elem
+      if not elem in extended_alphabet:
+         return nil
 
-   assert currentSeq != '', 'load_genomic returned empty sequence!'
+      if elem ==  ']':
+         seq_has_indels = True
+
+   for elem in currentDNASeq:
+      #assert elem in alphabet, 'Invalid char in dna: \"%s\"' % elem
+      if not elem in alphabet:
+         return nil
 
    if strand == '-':
       try:
-         currentSeq = reverse_complement(currentSeq)
+         currentDNASeq = reverse_complement(currentDNASeq)
       except:
          print 'invalid char'
          return nil
 
-   p_start     = fRead['p_start']
+   currentDNASeq = 'a' + currentDNASeq
+
    exon_stop   = fRead['exon_stop']
    exon_start  = fRead['exon_start']
-   p_stop      = fRead['p_stop']
 
-   currentReadSeq = fRead['seq']
-   quality = fRead['prb']
-   spos = fRead['splitpos']
+   # assertions to check whether exon start/stop positions are next to donor
+   # resp. acceptor sites
+   exonEnd     = exon_stop - currentGene.start
+   exonBegin   = exon_start - currentGene.start 
 
-   assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
-   assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
-   assert p_stop - p_start >= 36
+   current_don = currentDNASeq[exonEnd+1:exonEnd+3]
+   current_acc = currentDNASeq[exonBegin-2:exonBegin]
+   if not current_don == 'gt' or current_don == 'gc':
+      return nil
+
+   if not current_acc == 'ag':
+      return nil
+
+   #assert current_don == 'gt' or current_don == 'gc', pdb.set_trace()
+   #assert current_acc == 'ag', pdb.set_trace()
+
+   # assertions for don/acc sites held now we recalculate offset
+   exon_stop   = exon_stop - currentGene.start
+   exon_start  = exon_start - currentGene.start 
+
+   if not seq_has_indels:
+      p_start     = fRead['p_start']
+      exon_stop   = fRead['exon_stop']
+      exon_start  = fRead['exon_start']
+      p_stop      = fRead['p_stop']
+
+      assert (exon_stop - p_start) + (p_stop - exon_start) + 2 == Conf.read_size, 'Invalid read size'
+
+   else:
+      # if we have indels we have to take care of the exons boundaries
+      firstReadSeq   = currentReadSeq[:spos]
+      secondReadSeq  = currentReadSeq[spos:]
+
+      est_fragment = []
+      r_idx = 0
+      while True:
+         if not r_idx < len(currentReadSeq):
+            break
+
+         if currentReadSeq[r_idx] == '[':
+            est_fragment.append(currentReadSeq[r_idx+2])
+            r_idx += 4
+            continue
+
+         est_fragment.append(currentReadSeq[r_idx])
+         r_idx += 1
+
+      dna_fragment_1 = []
+      r_idx = 0
+      while True:
+         if not r_idx < len(firstReadSeq):
+            break
+
+         if firstReadSeq[r_idx] == '[':
+            dna_fragment_1.append(firstReadSeq[r_idx+1])
+            r_idx += 4
+            continue
+
+         dna_fragment_1.append(firstReadSeq[r_idx])
+         r_idx += 1
+
+      dna_fragment_2 = []
+      r_idx = 0
+      while True:
+         if not r_idx < len(secondReadSeq):
+            break
+
+         if secondReadSeq[r_idx] == '[':
+            dna_fragment_2.append(secondReadSeq[r_idx+1])
+            r_idx += 4
+            continue
+
+         dna_fragment_2.append(secondReadSeq[r_idx])
+         r_idx += 1
+
+      currentReadSeq = "".join(est_fragment)
+      dna_fragment_1 = "".join(dna_fragment_1)
+      dna_fragment_2 = "".join(dna_fragment_2)
+
+      dna_fragment_1_size = len([p for p in dna_fragment_1 if p != '-'])
+      dna_fragment_2_size = len([p for p in dna_fragment_2 if p != '-'])
+
+      # new check the fragment of the annotation
+      new_exon1_size = dna_fragment_1_size
+      p_start     = exon_stop - new_exon1_size + 1
+
+      dna_annot_1 = currentDNASeq[p_start:exon_stop+1]
+
+      # new check the fragment of the annotation
+      new_exon2_size = dna_fragment_2_size 
+      p_stop      = exon_start + new_exon2_size - 1
+
+      dna_annot_2 = currentDNASeq[exon_start:p_stop+1]
+
+      #assert dna_fragment_1.replace('-','') == dna_annot_1, pdb.set_trace()
+      #assert dna_fragment_2.replace('-','') == dna_annot_2, pdb.set_trace()
+
+      if not dna_fragment_1.replace('-','') == dna_annot_1:
+         #print 'genomic seq mismatch'
+         #print 'orignal read:\t %s ' % originalReadSeq
+         #print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
+         #print 'dna_fragment_1:\t %s' % dna_fragment_1
+         #print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
+         return nil
+
+      if not dna_fragment_2.replace('-','') == dna_annot_2:
+         #print 'genomic seq mismatch'
+         #print 'orignal read:\t %s ' % originalReadSeq
+         #print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
+         #print 'dna_fragment_2:\t %s' % dna_fragment_2
+         #print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
+         return nil
+
+      #print 'successful'
+
+      for elem in currentReadSeq:
+         assert elem in alphabet, 'Invalid char (alphabet) in read: \"%s\"' % elem
+
+   #assert p_start < exon_stop < exon_start < p_stop, pdb.set_trace()
+   assert p_stop - p_start >= Conf.read_size
 
    currentExons = zeros((2,2),dtype=numpy.int)
 
@@ -182,17 +351,14 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    currentExons[1,0] = exon_start
    currentExons[1,1] = p_stop
 
-   # make indices relative to start pos of the current gene
-   currentExons -= currentGene.start
-
    # determine cutting positions for sequences
    up_cut   = int(currentExons[0,0])
    down_cut = int(currentExons[1,1])
 
-   # if we perform testing we cut a much wider region as we want to detect
+# if we perform testing we cut a much wider region as we want to detect
    # how good we perform on a region
    import random
-   ADD_POS = random.randint(1,20)
+   ADD_POS = random.randint(Conf.extension[0],Conf.extension[1])
 
    if test:
       up_cut = up_cut-ADD_POS
@@ -201,14 +367,15 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
 
       down_cut = down_cut+ADD_POS
 
-      if down_cut > len(currentSeq):
-         down_cut = len(currentSeq)
+      if down_cut > len(currentDNASeq):
+         down_cut = len(currentDNASeq)
 
    # cut out part of the sequence
-   currentSeq = currentSeq[up_cut:down_cut]
+   currentDNASeq = currentDNASeq[up_cut:down_cut]
 
    # ensure that index is correct
    currentExons[0,1] += 1
+   currentExons[1,1] += 1
 
    if test:
       currentExons -= up_cut
@@ -216,17 +383,18 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
       currentExons -= currentExons[0,0]
 
    try:
-      if not (currentSeq[int(currentExons[0,1])] == 'g' and\
-      currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
+      if not (currentDNASeq[int(currentExons[0,1])] == 'g' and\
+      currentDNASeq[int(currentExons[0,1])+1] in ['c','t' ]):
          print 'not aligned'
          return nil
 
-      if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
+      if not (currentDNASeq[int(currentExons[1,0])-1] == 'g' and currentDNASeq[int(currentExons[1,0])-2] == 'a'):
          print 'not aligned'
          return nil
 
    except:
-      pdb.set_trace()
+      return nil
+      #pdb.set_trace()
 
    # now we want to use interval_query to get the predicted splice scores
    # trained on the TAIR sequence and annotation
@@ -236,12 +404,12 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
    % (chr,strand)
 
-   ag_tuple_pos = [p for p,e in enumerate(currentSeq) if p>0 and p<len(currentSeq)-1 and currentSeq[p-1]=='a' and e=='g' ]
+   ag_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and currentDNASeq[p-1]=='a' and e=='g' ]
 
    # fetch acceptor scores
    gf = Genef()
    num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
-   assert num_hits <= len(currentSeq)
+   assert num_hits <= len(currentDNASeq)
    
    #print 'Acceptor hits: %d' % num_hits
    pos = createIntArrayFromList([0]*num_hits)
@@ -249,14 +417,14 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    scores = createDoubleArrayFromList([0]*num_hits*num_fields)
    gf.getResults(pos,indices,scores)
 
-   acc  = [-inf]*(len(currentSeq))
+   acc  = [-inf]*(len(currentDNASeq))
 
    for i in range(num_hits):
       position = pos[i] 
       position -= (currentGene.start+up_cut)
-      if position < 2 or position > len(currentSeq)-1:
+      if position < 2 or position > len(currentDNASeq)-1:
          continue
-      assert 0 <= position < len(currentSeq), 'position index wrong'
+      assert 0 <= position < len(currentDNASeq), 'position index wrong'
       acc[position] = scores[i]
 
    acc = acc[1:] + [-inf]
@@ -269,9 +437,9 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
 
    gf = Genef()
    num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
-   assert num_hits <= len(currentSeq)
+   assert num_hits <= len(currentDNASeq)
 
-   gt_tuple_pos = [p for p,e in enumerate(currentSeq) if p>0 and p<len(currentSeq)-1 and e=='g' and (currentSeq[p+1]=='t' or currentSeq[p+1]=='c')]
+   gt_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and e=='g' and (currentDNASeq[p+1]=='t' or currentDNASeq[p+1]=='c')]
 
    #print 'Donor hits: %d' % num_hits
    pos = createIntArrayFromList([0]*num_hits)
@@ -279,29 +447,42 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    scores = createDoubleArrayFromList([0]*num_hits*num_fields)
    gf.getResults(pos,indices,scores)
 
-   don     = [-inf]*(len(currentSeq))
+   don     = [-inf]*(len(currentDNASeq))
 
    try:
       for i in range(num_hits):
          position = pos[i] 
          position -= (currentGene.start+up_cut)
-         if position < 1 or position >= len(currentSeq)-1:
+         if position < 1 or position >= len(currentDNASeq)-1:
             continue
          don[position-1] = scores[i]
    except:
-      pdb.set_trace()
+      return nil
+      #pdb.set_trace()
 
    don = [-inf] + don[:-1]
 
-   assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
-   assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
+   #assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
+   #assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
+   #assert (len(currentDNASeq)) == len(acc), pdb.set_trace()
+   #assert (len(currentDNASeq)) == len(don), pdb.set_trace()
+
+
+   if not ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf]:
+      return nil
+
+   if not gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf]:
+      return nil
+
+   if not (len(currentDNASeq)) == len(acc):
+      return nil
 
-   assert (len(currentSeq)) == len(acc), pdb.set_trace()
-   assert (len(currentSeq)) == len(don), pdb.set_trace()
+   if not (len(currentDNASeq)) == len(don):
+      return nil
 
    acc = [-inf] + acc[:-1]
 
-   return currentSeq, acc, don, currentExons, currentReadSeq, quality, spos
+   return currentDNASeq, acc, don, currentExons, currentReadSeq, originalReadSeq, quality, spos
 
 
 def reverse_complement(seq):