+ added basic scripts to generate runs for an experiment
[qpalma.git] / scripts / compile_dataset.py
index 51c5d9d..cf79622 100644 (file)
@@ -2,14 +2,33 @@
 # -*- coding: utf-8 -*-
 
 import sys
+import os
 import pdb
 import io_pickle
 
+import numpy
+from numpy.matlib import mat,zeros,ones,inf
+
+import qpalma
 import qpalma.tools
 from qpalma.tools.parseGff import parse_gff
 
 from qpalma.parsers import FilteredReadParser, RemappedReadParser
 
+from Genefinding import *
+
+from genome_utils import load_genomic
+
+import qpalma.Configuration as Conf
+
+warning = """
+   #########################################################
+   WARNING ! Disabled the use of non-bracket reads WARNING !
+   #########################################################
+   
+   """
+
+
 help = """
 
    Usage of this script is:
@@ -45,13 +64,9 @@ info = """
    - 'remappped_pos' - the position of the remapped read
    """
 
-def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file):
+def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
 
-   assert os.path.exists(gff_file)
-   for file in dna_flat_files:
-      assert os.path.exists(file)
-
-   assert os.path.exists(solexa_reads)
+   assert os.path.exists(filtered_reads)
    assert os.path.exists(remapped_reads)
 
    assert not os.path.exists(dataset_file), 'The data_file already exists!'
@@ -59,16 +74,25 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    joinp = os.path.join
 
    # first read the gff file(s) and create pickled gene information
-   allGenes = parse_gff(gff_file,joinp(tmp_dir,'gff_info.pickle'))
-   
-   dna_filename = Conf.dna_filename
-   est_filename = Conf.est_filename
+   allGenes = [None]*6
 
+   for i in range(1,6):
+      allGenes[i]= parse_gff(gff_file%i) #,joinp(tmp_dir,'gff_info.pickle'))
+
+   print 'parsing filtered reads..'
    frp = FilteredReadParser(filtered_reads)
    all_filtered_reads = frp.parse()
+   print 'found %d filtered reads' % len(all_filtered_reads)
 
+   print 'parsing remapped reads'
    rrp = RemappedReadParser(remapped_reads)
    all_remapped_reads = rrp.parse()
+   if all_remapped_reads != None:
+      size_rem = len(all_remapped_reads)
+   else:
+      size_rem = 0
+
+   print 'found %d remapped reads' % size_rem
 
    # this stores the new dataset
    Sequences = []
@@ -76,23 +100,38 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    Donors = []
    Exons = []
    Ests = []
+   OriginalEsts = []
    Qualities = []
    SplitPositions = []
+   Ids = []
+
+   #pdb.set_trace()
    
    # Iterate over all remapped reads in order to generate for each read an
    # training / prediction example
-   for reRead in all_remapped_reads:
-      currentId = reRead['id']
-      fRead = all_filtered_reads[currentId]
-      currentGene = allGenes[gene_id]
+   instance_counter = 0
+
+   for id,currentFRead in all_filtered_reads.items():
 
-      chrom             = 'chr1'
-      genome_file       = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-      sscore_filename   = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+' 
+      try:
+         currentRReads = all_remapped_reads[id]
+      except:
+         currentRReads = None
 
-      seq, acc, don, exons, est, qual, spos = process_read(reRead,fRead,currentGene,chrom,genome_file,sscore_filename)
+      try:
+         gene_id = currentFRead['gene_id']
+         chro = currentFRead['chr']
+         currentGene = allGenes[chro][gene_id]
+      except:
+         pdb.set_trace()
+
+      if currentFRead['strand'] != '+':
+         #print 'wrong strand'
+         continue
+
+      seq, acc, don, exons, est, original_est, qual, spos = process_read(currentRReads,currentFRead,currentGene,dna_flat_files,test)
       
-      if newExample == None:
+      if seq == '':
          continue
 
       Sequences.append(seq)
@@ -100,43 +139,211 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
       Donors.append(don)
       Exons.append(exons)
       Ests.append(est)
+      OriginalEsts.append(original_est)
       Qualities.append(qual)
       SplitPositions.append(spos)
+      Ids.append(id)
+
+      instance_counter += 1
+
+      if instance_counter % 100 == 0:
+         print 'processed %d examples' % instance_counter
 
+      if instance_counter == 20000:
+         break
+
+   print 'Full dataset has size %d' % len(Sequences)
+
+   dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
+   'Exons':Exons, 'Ests':Ests, 'OriginalEsts':OriginalEsts, 'Qualities':Qualities,\
+   'SplitPositions':SplitPositions,'Ids':Ids}
 
-   dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors\
-   'Exons':Exons, 'Ests':Ests, 'Qualities':Qualities, 'SplitPositions':SplitPositions}
    # saving new dataset
    io_pickle.save(dataset_file,dataset)
 
 
-def process_read(reRead,fRead,currentGene,chrom,genome_file,sscore_filename):
+def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    """
    The purpose of this function is to create an example for QPalma 
    by using a 
 
    """
+   nil = ('','','','','','','','')
+   extended_alphabet = ['-','a','c','g','t','n','[',']']
+   alphabet = ['-','a','c','g','t','n']
+
+   chr            = fRead['chr']
+   strand         = fRead['strand']
+   quality        = fRead['prb']
+   #quality        = fRead['cal_prb']
+   #quality        = fRead['chastity']
+   spos           = fRead['splitpos']
+   currentReadSeq = fRead['seq']
+   currentReadSeq = currentReadSeq.lower()
+
+   #if currentReadSeq.find('[') == -1:
+   #   print warning
+   #   return nil
+
+   originalReadSeq = fRead['seq']
+
+   chrom             = 'chr%d' % chr
+   sscore_filename   = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s' % (chr,strand)
 
    # use matlab-style functions to access dna sequence
-   dna = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True)
-   dna = dna.lower()
-   currentSeq = dna
+   currentDNASeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=False)
+   currentDNASeq = currentDNASeq.lower()
+
+   assert currentDNASeq != '', 'load_genomic returned empty sequence!'
+
+   seq_has_indels = False
+   for elem in currentReadSeq:
+      #assert elem in extended_alphabet, 'Invalid char (extended_alphabet) in read: \"%s\"' % elem
+      if not elem in extended_alphabet:
+         return nil
+
+      if elem ==  ']':
+         seq_has_indels = True
+
+   for elem in currentDNASeq:
+      #assert elem in alphabet, 'Invalid char in dna: \"%s\"' % elem
+      if not elem in alphabet:
+         return nil
+
+   if strand == '-':
+      try:
+         currentDNASeq = reverse_complement(currentDNASeq)
+      except:
+         print 'invalid char'
+         return nil
+
+   currentDNASeq = 'a' + currentDNASeq
+
+   exon_stop   = fRead['exon_stop']
+   exon_start  = fRead['exon_start']
+
+   # assertions to check whether exon start/stop positions are next to donor
+   # resp. acceptor sites
+   exonEnd     = exon_stop - currentGene.start
+   exonBegin   = exon_start - currentGene.start 
+
+   current_don = currentDNASeq[exonEnd+1:exonEnd+3]
+   current_acc = currentDNASeq[exonBegin-2:exonBegin]
+   if not current_don == 'gt' or current_don == 'gc':
+      return nil
+
+   if not current_acc == 'ag':
+      return nil
+
+   #assert current_don == 'gt' or current_don == 'gc', pdb.set_trace()
+   #assert current_acc == 'ag', pdb.set_trace()
+
+   # assertions for don/acc sites held now we recalculate offset
+   exon_stop   = exon_stop - currentGene.start
+   exon_start  = exon_start - currentGene.start 
+
+   if not seq_has_indels:
+      p_start     = fRead['p_start']
+      exon_stop   = fRead['exon_stop']
+      exon_start  = fRead['exon_start']
+      p_stop      = fRead['p_stop']
+
+      assert (exon_stop - p_start) + (p_stop - exon_start) + 2 == Conf.read_size, 'Invalid read size'
 
-   if len(currentSeq) < (currentGene.stop - currentGene.start):
-      return None
+   else:
+      # if we have indels we have to take care of the exons boundaries
+      firstReadSeq   = currentReadSeq[:spos]
+      secondReadSeq  = currentReadSeq[spos:]
+
+      est_fragment = []
+      r_idx = 0
+      while True:
+         if not r_idx < len(currentReadSeq):
+            break
+
+         if currentReadSeq[r_idx] == '[':
+            est_fragment.append(currentReadSeq[r_idx+2])
+            r_idx += 4
+            continue
+
+         est_fragment.append(currentReadSeq[r_idx])
+         r_idx += 1
+
+      dna_fragment_1 = []
+      r_idx = 0
+      while True:
+         if not r_idx < len(firstReadSeq):
+            break
+
+         if firstReadSeq[r_idx] == '[':
+            dna_fragment_1.append(firstReadSeq[r_idx+1])
+            r_idx += 4
+            continue
+
+         dna_fragment_1.append(firstReadSeq[r_idx])
+         r_idx += 1
+
+      dna_fragment_2 = []
+      r_idx = 0
+      while True:
+         if not r_idx < len(secondReadSeq):
+            break
+
+         if secondReadSeq[r_idx] == '[':
+            dna_fragment_2.append(secondReadSeq[r_idx+1])
+            r_idx += 4
+            continue
+
+         dna_fragment_2.append(secondReadSeq[r_idx])
+         r_idx += 1
 
-   p_start = fRead['p_start']
-   exon_stop = fRead['exon_stop']
-   exon_start = fRead['exon_start']
-   p_stop = fRead['p_stop']
+      currentReadSeq = "".join(est_fragment)
+      dna_fragment_1 = "".join(dna_fragment_1)
+      dna_fragment_2 = "".join(dna_fragment_2)
 
-   currentSeq = fRead['seq']
+      dna_fragment_1_size = len([p for p in dna_fragment_1 if p != '-'])
+      dna_fragment_2_size = len([p for p in dna_fragment_2 if p != '-'])
 
-   assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
-   assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
-   assert p_stop - p_start >= 36
+      # new check the fragment of the annotation
+      new_exon1_size = dna_fragment_1_size
+      p_start     = exon_stop - new_exon1_size + 1
 
-   currentExons = zeros((2,2))
+      dna_annot_1 = currentDNASeq[p_start:exon_stop+1]
+
+      # new check the fragment of the annotation
+      new_exon2_size = dna_fragment_2_size 
+      p_stop      = exon_start + new_exon2_size - 1
+
+      dna_annot_2 = currentDNASeq[exon_start:p_stop+1]
+
+      #assert dna_fragment_1.replace('-','') == dna_annot_1, pdb.set_trace()
+      #assert dna_fragment_2.replace('-','') == dna_annot_2, pdb.set_trace()
+
+      if not dna_fragment_1.replace('-','') == dna_annot_1:
+         #print 'genomic seq mismatch'
+         #print 'orignal read:\t %s ' % originalReadSeq
+         #print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
+         #print 'dna_fragment_1:\t %s' % dna_fragment_1
+         #print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
+         return nil
+
+      if not dna_fragment_2.replace('-','') == dna_annot_2:
+         #print 'genomic seq mismatch'
+         #print 'orignal read:\t %s ' % originalReadSeq
+         #print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
+         #print 'dna_fragment_2:\t %s' % dna_fragment_2
+         #print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
+         return nil
+
+      #print 'successful'
+
+      for elem in currentReadSeq:
+         assert elem in alphabet, 'Invalid char (alphabet) in read: \"%s\"' % elem
+
+   #assert p_start < exon_stop < exon_start < p_stop, pdb.set_trace()
+   assert p_stop - p_start >= Conf.read_size
+
+   currentExons = zeros((2,2),dtype=numpy.int)
 
    currentExons[0,0] = p_start
    currentExons[0,1] = exon_stop
@@ -144,30 +351,31 @@ def process_read(reRead,fRead,currentGene,chrom,genome_file,sscore_filename):
    currentExons[1,0] = exon_start
    currentExons[1,1] = p_stop
 
-   # make indices relative to start pos of the current gene
-   currentExons -= currentGene.start
-
    # determine cutting positions for sequences
    up_cut   = int(currentExons[0,0])
    down_cut = int(currentExons[1,1])
 
-   # if we perform testing we cut a much wider region as we want to detect
+# if we perform testing we cut a much wider region as we want to detect
    # how good we perform on a region
+   import random
+   ADD_POS = random.randint(Conf.extension[0],Conf.extension[1])
+
    if test:
-      up_cut = up_cut-500
+      up_cut = up_cut-ADD_POS
       if up_cut < 0:
          up_cut = 0
 
-      down_cut = down_cut+500
+      down_cut = down_cut+ADD_POS
 
-      if down_cut > len(currentSeq):
-         down_cut = len(currentSeq)
+      if down_cut > len(currentDNASeq):
+         down_cut = len(currentDNASeq)
 
    # cut out part of the sequence
-   currentSeq = currentSeq[up_cut:down_cut]
+   currentDNASeq = currentDNASeq[up_cut:down_cut]
 
    # ensure that index is correct
    currentExons[0,1] += 1
+   currentExons[1,1] += 1
 
    if test:
       currentExons -= up_cut
@@ -175,53 +383,63 @@ def process_read(reRead,fRead,currentGene,chrom,genome_file,sscore_filename):
       currentExons -= currentExons[0,0]
 
    try:
-      if not (currentSeq[int(currentExons[0,1])] == 'g' and\
-      currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
-         continue
+      if not (currentDNASeq[int(currentExons[0,1])] == 'g' and\
+      currentDNASeq[int(currentExons[0,1])+1] in ['c','t' ]):
+         print 'not aligned'
+         return nil
 
-      if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
-         continue
+      if not (currentDNASeq[int(currentExons[1,0])-1] == 'g' and currentDNASeq[int(currentExons[1,0])-2] == 'a'):
+         print 'not aligned'
+         return nil
 
    except:
-      pdb.set_trace()
+      return nil
+      #pdb.set_trace()
 
    # now we want to use interval_query to get the predicted splice scores
    # trained on the TAIR sequence and annotation
-   from Genefinding import *
-   
-   interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
+   interval_matrix = createIntArrayFromList([currentGene.start+up_cut-100,currentGene.start+down_cut+100])
    num_fields = 1
    num_intervals = 1
+   sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
+   % (chr,strand)
+
+   ag_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and currentDNASeq[p-1]=='a' and e=='g' ]
 
    # fetch acceptor scores
    gf = Genef()
    num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
-   assert num_hits <= len(currentSeq)
+   assert num_hits <= len(currentDNASeq)
    
    #print 'Acceptor hits: %d' % num_hits
-
    pos = createIntArrayFromList([0]*num_hits)
    indices = createIntArrayFromList([0]*num_hits)
    scores = createDoubleArrayFromList([0]*num_hits*num_fields)
    gf.getResults(pos,indices,scores)
 
-   acc  = [-inf]*(len(currentSeq)+1)
-   
+   acc  = [-inf]*(len(currentDNASeq))
+
    for i in range(num_hits):
       position = pos[i] 
       position -= (currentGene.start+up_cut)
-      assert 0 <= position < len(currentSeq)+1, 'position index wrong'
+      if position < 2 or position > len(currentDNASeq)-1:
+         continue
+      assert 0 <= position < len(currentDNASeq), 'position index wrong'
       acc[position] = scores[i]
 
    acc = acc[1:] + [-inf]
-   Acceptors.append(acc)
 
    del gf
 
    # fetch donor scores
+   sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s'\
+   % (chr,strand)
+
    gf = Genef()
    num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
-   assert num_hits <= len(currentSeq)
+   assert num_hits <= len(currentDNASeq)
+
+   gt_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and e=='g' and (currentDNASeq[p+1]=='t' or currentDNASeq[p+1]=='c')]
 
    #print 'Donor hits: %d' % num_hits
    pos = createIntArrayFromList([0]*num_hits)
@@ -229,34 +447,56 @@ def process_read(reRead,fRead,currentGene,chrom,genome_file,sscore_filename):
    scores = createDoubleArrayFromList([0]*num_hits*num_fields)
    gf.getResults(pos,indices,scores)
 
-   don     = [-inf]*(len(currentSeq))
+   don     = [-inf]*(len(currentDNASeq))
 
    try:
-      for i in range(1,num_hits):
+      for i in range(num_hits):
          position = pos[i] 
          position -= (currentGene.start+up_cut)
+         if position < 1 or position >= len(currentDNASeq)-1:
+            continue
          don[position-1] = scores[i]
    except:
-      pdb.set_trace()
+      return nil
+      #pdb.set_trace()
+
+   don = [-inf] + don[:-1]
+
+   #assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
+   #assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
+   #assert (len(currentDNASeq)) == len(acc), pdb.set_trace()
+   #assert (len(currentDNASeq)) == len(don), pdb.set_trace()
+
+
+   if not ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf]:
+      return nil
 
-   don = [-inf] + currentDonors[:-1]
+   if not gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf]:
+      return nil
 
-   return seq, acc, don, currentExons, est, qual, spos
+   if not (len(currentDNASeq)) == len(acc):
+      return nil
+
+   if not (len(currentDNASeq)) == len(don):
+      return nil
+
+   acc = [-inf] + acc[:-1]
+
+   return currentDNASeq, acc, don, currentExons, currentReadSeq, originalReadSeq, quality, spos
 
 
 def reverse_complement(seq):
-   map {'a':'t','c':'g','g':'c','t':'a'}
+   map {'a':'t','c':'g','g':'c','t':'a'}
 
    new_seq = [map[elem] for elem in seq]
    new_seq.reverse()
+   new_seq = "".join(new_seq)
 
    return new_seq
    
-
 if __name__ == '__main__':
    if len(sys.argv) == 1:
       print info
 
    assert len(sys.argv) == 6, help
-
-   compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file):
+   compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file)