+ modified dataset generation in order to recalculate exons boundaries
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Sat, 9 Feb 2008 14:53:16 +0000 (14:53 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Sat, 9 Feb 2008 14:53:16 +0000 (14:53 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7754 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/compile_dataset.py

index 4479d8b..0250742 100644 (file)
@@ -19,6 +19,8 @@ from Genefinding import *
 
 from genome_utils import load_genomic
 
+import qpalma.Configuration as Conf
+
 help = """
 
    Usage of this script is:
@@ -90,6 +92,7 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    Donors = []
    Exons = []
    Ests = []
+   OriginalEsts = []
    Qualities = []
    SplitPositions = []
    Ids = []
@@ -116,8 +119,8 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
 
       if currentFRead['strand'] != '+':
          continue
-   
-      seq, acc, don, exons, est, qual, spos = process_read(currentRReads,currentFRead,currentGene,dna_flat_files,test)
+
+      seq, acc, don, exons, est, original_est, qual, spos = process_read(currentRReads,currentFRead,currentGene,dna_flat_files,test)
       
       if seq == '':
          continue
@@ -127,6 +130,7 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
       Donors.append(don)
       Exons.append(exons)
       Ests.append(est)
+      OriginalEsts.append(original_est)
       Qualities.append(qual)
       SplitPositions.append(spos)
       Ids.append(id)
@@ -139,6 +143,9 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
       if instance_counter == 1000:
          break
 
+
+   print 'Full dataset has size %d' % len(Sequences)
+
    dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
    'Exons':Exons, 'Ests':Ests, 'Qualities':Qualities,\
    'SplitPositions':SplitPositions,'Ids':Ids}
@@ -154,38 +161,110 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
 
    """
    nil = ('','','','','','','')
+   extended_alphabet = ['-','a','c','g','t','n','[',']']
+   alphabet = ['-','a','c','g','t','n']
+
+   chr            = fRead['chr']
+   strand         = fRead['strand']
+   quality        = fRead['prb']
+   spos           = fRead['splitpos']
+   currentReadSeq = fRead['seq']
+   currentReadSeq = currentReadSeq.lower()
 
-   chr = fRead['chr']
-   strand = fRead['strand']
+   originalReadSeq = fRead['seq']
 
    chrom             = 'chr%d' % chr
    sscore_filename   = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s' % (chr,strand)
 
    # use matlab-style functions to access dna sequence
-   currentSeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=True)
-   currentSeq = currentSeq.lower()
+   currentDNASeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=True)
+   currentDNASeq = currentDNASeq.lower()
+
+   assert currentDNASeq != '', 'load_genomic returned empty sequence!'
 
-   assert currentSeq != '', 'load_genomic returned empty sequence!'
+   seq_has_indels = False
+   for elem in currentReadSeq:
+      assert elem in extended_alphabet, 'Invalid char (extended_alphabet) in read: \"%s\"' % elem
+      if elem ==  ']':
+         seq_has_indels = True
+
+   for elem in currentDNASeq:
+      assert elem in alphabet, 'Invalid char in dna: \"%s\"' % elem
 
    if strand == '-':
       try:
-         currentSeq = reverse_complement(currentSeq)
+         currentDNASeq = reverse_complement(currentDNASeq)
       except:
          print 'invalid char'
          return nil
 
-   p_start     = fRead['p_start']
    exon_stop   = fRead['exon_stop']
    exon_start  = fRead['exon_start']
-   p_stop      = fRead['p_stop']
 
-   currentReadSeq = fRead['seq']
-   quality = fRead['prb']
-   spos = fRead['splitpos']
+
+   if not seq_has_indels:
+      p_start     = fRead['p_start']
+      exon_stop   = fRead['exon_stop']
+      exon_start  = fRead['exon_start']
+      p_stop      = fRead['p_stop']
+
+      assert (exon_stop - p_start) + (p_stop - exon_start) + 2 == Conf.read_size, 'Invalid read size'
+
+   else:
+      # if we have indels we have to take care of the exons boundaries
+      #pdb.set_trace()
+
+      import re
+      dna_del  = re.compile('\[-(?P<base>[acgt])\]')
+      read_del = re.compile('\[[acgt](?P<base>-)\]')
+      
+      elem_ctr = 0
+      r_idx = 0
+      while True:
+         if not r_idx < len(currentReadSeq):
+            break
+
+         if elem_ctr == spos:
+            break
+
+         if currentReadSeq[r_idx] == '[':
+            r_idx += 4
+            elem_ctr += 1
+
+         elem_ctr += 1
+         r_idx += 1
+
+      firstReadSeq   = currentReadSeq[:r_idx]
+      secondReadSeq  = currentReadSeq[r_idx:]
+
+      firstReadSeqPruned = re.sub(dna_del,'\g<base>',firstReadSeq)
+      firstReadSeqPruned = re.sub(read_del,'\g<base>',firstReadSeqPruned)
+
+      secondReadSeqPruned = re.sub(dna_del,'\g<base>',secondReadSeq)
+      secondReadSeqPruned = re.sub(read_del,'\g<base>',secondReadSeqPruned)
+
+      numDNADel   = len(dna_del.findall(firstReadSeq))
+      numReadDel  = len(read_del.findall(firstReadSeq))
+
+      new_exon1_size = len(firstReadSeqPruned) + numReadDel - numDNADel
+      p_start     = exon_stop - new_exon1_size + 1
+
+      numDNADel   = len(dna_del.findall(secondReadSeq))
+      numReadDel  = len(read_del.findall(secondReadSeq))
+
+      new_exon2_size = len(secondReadSeqPruned) + numReadDel - numDNADel
+      p_stop      = exon_start + new_exon2_size - 1 
+
+      currentReadSeq = firstReadSeqPruned+secondReadSeqPruned
+      assert len(currentReadSeq) == Conf.read_size, 'Error with glued read'
+
+      #pdb.set_trace()
+
+      for elem in currentReadSeq:
+         assert elem in alphabet, 'Invalid char (alphabet) in read: \"%s\"' % elem
 
    assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
-   assert (exon_stop - p_start) + (p_stop - exon_start) + 2 == 36, 'Invalid read size'
-   assert p_stop - p_start >= 36
+   assert p_stop - p_start >= Conf.read_size
 
    currentExons = zeros((2,2),dtype=numpy.int)
 
@@ -205,7 +284,7 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    # if we perform testing we cut a much wider region as we want to detect
    # how good we perform on a region
    import random
-   ADD_POS = random.randint(1,1000)
+   ADD_POS = random.randint(250,1000)
 
    if test:
       up_cut = up_cut-ADD_POS
@@ -214,11 +293,11 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
 
       down_cut = down_cut+ADD_POS
 
-      if down_cut > len(currentSeq):
-         down_cut = len(currentSeq)
+      if down_cut > len(currentDNASeq):
+         down_cut = len(currentDNASeq)
 
    # cut out part of the sequence
-   currentSeq = currentSeq[up_cut:down_cut]
+   currentDNASeq = currentDNASeq[up_cut:down_cut]
 
    # ensure that index is correct
    currentExons[0,1] += 1
@@ -229,12 +308,12 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
       currentExons -= currentExons[0,0]
 
    try:
-      if not (currentSeq[int(currentExons[0,1])] == 'g' and\
-      currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
+      if not (currentDNASeq[int(currentExons[0,1])] == 'g' and\
+      currentDNASeq[int(currentExons[0,1])+1] in ['c','t' ]):
          print 'not aligned'
          return nil
 
-      if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
+      if not (currentDNASeq[int(currentExons[1,0])-1] == 'g' and currentDNASeq[int(currentExons[1,0])-2] == 'a'):
          print 'not aligned'
          return nil
 
@@ -249,12 +328,12 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
    % (chr,strand)
 
-   ag_tuple_pos = [p for p,e in enumerate(currentSeq) if p>0 and p<len(currentSeq)-1 and currentSeq[p-1]=='a' and e=='g' ]
+   ag_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and currentDNASeq[p-1]=='a' and e=='g' ]
 
    # fetch acceptor scores
    gf = Genef()
    num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
-   assert num_hits <= len(currentSeq)
+   assert num_hits <= len(currentDNASeq)
    
    #print 'Acceptor hits: %d' % num_hits
    pos = createIntArrayFromList([0]*num_hits)
@@ -262,14 +341,14 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    scores = createDoubleArrayFromList([0]*num_hits*num_fields)
    gf.getResults(pos,indices,scores)
 
-   acc  = [-inf]*(len(currentSeq))
+   acc  = [-inf]*(len(currentDNASeq))
 
    for i in range(num_hits):
       position = pos[i] 
       position -= (currentGene.start+up_cut)
-      if position < 2 or position > len(currentSeq)-1:
+      if position < 2 or position > len(currentDNASeq)-1:
          continue
-      assert 0 <= position < len(currentSeq), 'position index wrong'
+      assert 0 <= position < len(currentDNASeq), 'position index wrong'
       acc[position] = scores[i]
 
    acc = acc[1:] + [-inf]
@@ -282,9 +361,9 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
 
    gf = Genef()
    num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
-   assert num_hits <= len(currentSeq)
+   assert num_hits <= len(currentDNASeq)
 
-   gt_tuple_pos = [p for p,e in enumerate(currentSeq) if p>0 and p<len(currentSeq)-1 and e=='g' and (currentSeq[p+1]=='t' or currentSeq[p+1]=='c')]
+   gt_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and e=='g' and (currentDNASeq[p+1]=='t' or currentDNASeq[p+1]=='c')]
 
    #print 'Donor hits: %d' % num_hits
    pos = createIntArrayFromList([0]*num_hits)
@@ -292,13 +371,13 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    scores = createDoubleArrayFromList([0]*num_hits*num_fields)
    gf.getResults(pos,indices,scores)
 
-   don     = [-inf]*(len(currentSeq))
+   don     = [-inf]*(len(currentDNASeq))
 
    try:
       for i in range(num_hits):
          position = pos[i] 
          position -= (currentGene.start+up_cut)
-         if position < 1 or position >= len(currentSeq)-1:
+         if position < 1 or position >= len(currentDNASeq)-1:
             continue
          don[position-1] = scores[i]
    except:
@@ -309,14 +388,14 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
    assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
 
-   assert (len(currentSeq)) == len(acc), pdb.set_trace()
-   assert (len(currentSeq)) == len(don), pdb.set_trace()
+   assert (len(currentDNASeq)) == len(acc), pdb.set_trace()
+   assert (len(currentDNASeq)) == len(don), pdb.set_trace()
 
    acc = [-inf] + acc[:-1]
 
    currentExons[1,1] += 1
 
-   return currentSeq, acc, don, currentExons, currentReadSeq, quality, spos
+   return currentDNASeq, acc, don, currentExons, currentReadSeq, originalReadSeq, quality, spos
 
 
 def reverse_complement(seq):