+ added basic scripts to generate runs for an experiment
[qpalma.git] / scripts / compile_dataset.py
index 30b651c..cf79622 100644 (file)
@@ -19,6 +19,16 @@ from Genefinding import *
 
 from genome_utils import load_genomic
 
+import qpalma.Configuration as Conf
+
+warning = """
+   #########################################################
+   WARNING ! Disabled the use of non-bracket reads WARNING !
+   #########################################################
+   
+   """
+
+
 help = """
 
    Usage of this script is:
@@ -54,11 +64,7 @@ info = """
    - 'remappped_pos' - the position of the remapped read
    """
 
-def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file):
-
-   assert os.path.exists(gff_file)
-   #for file in dna_flat_files:
-   #   assert os.path.exists(file)
+def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
 
    assert os.path.exists(filtered_reads)
    assert os.path.exists(remapped_reads)
@@ -68,16 +74,25 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    joinp = os.path.join
 
    # first read the gff file(s) and create pickled gene information
-   allGenes = parse_gff(gff_file) #,joinp(tmp_dir,'gff_info.pickle'))
-   
-   print 'parsing filtered reads'
-   frp = FilteredReadParser(filtered_reads)
+   allGenes = [None]*6
 
-   print 'parsing remapped reads'
+   for i in range(1,6):
+      allGenes[i]= parse_gff(gff_file%i) #,joinp(tmp_dir,'gff_info.pickle'))
+
+   print 'parsing filtered reads..'
+   frp = FilteredReadParser(filtered_reads)
    all_filtered_reads = frp.parse()
+   print 'found %d filtered reads' % len(all_filtered_reads)
 
+   print 'parsing remapped reads'
    rrp = RemappedReadParser(remapped_reads)
    all_remapped_reads = rrp.parse()
+   if all_remapped_reads != None:
+      size_rem = len(all_remapped_reads)
+   else:
+      size_rem = 0
+
+   print 'found %d remapped reads' % size_rem
 
    # this stores the new dataset
    Sequences = []
@@ -85,12 +100,17 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    Donors = []
    Exons = []
    Ests = []
+   OriginalEsts = []
    Qualities = []
    SplitPositions = []
    Ids = []
+
+   #pdb.set_trace()
    
    # Iterate over all remapped reads in order to generate for each read an
    # training / prediction example
+   instance_counter = 0
+
    for id,currentFRead in all_filtered_reads.items():
 
       try:
@@ -98,14 +118,18 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
       except:
          currentRReads = None
 
-      gene_id = currentFRead['gene_id']
-      currentGene = allGenes[gene_id]
+      try:
+         gene_id = currentFRead['gene_id']
+         chro = currentFRead['chr']
+         currentGene = allGenes[chro][gene_id]
+      except:
+         pdb.set_trace()
 
-      chrom             = 'chr1'
-      genome_file       = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-      sscore_filename   = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+' 
+      if currentFRead['strand'] != '+':
+         #print 'wrong strand'
+         continue
 
-      seq, acc, don, exons, est, qual, spos = process_read(currentRReads,currentFRead,currentGene,chrom,genome_file,sscore_filename)
+      seq, acc, don, exons, est, original_est, qual, spos = process_read(currentRReads,currentFRead,currentGene,dna_flat_files,test)
       
       if seq == '':
          continue
@@ -115,45 +139,209 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
       Donors.append(don)
       Exons.append(exons)
       Ests.append(est)
+      OriginalEsts.append(original_est)
       Qualities.append(qual)
       SplitPositions.append(spos)
       Ids.append(id)
 
+      instance_counter += 1
+
+      if instance_counter % 100 == 0:
+         print 'processed %d examples' % instance_counter
+
+      if instance_counter == 20000:
+         break
+
+   print 'Full dataset has size %d' % len(Sequences)
+
    dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
-   'Exons':Exons, 'Ests':Ests, 'Qualities':Qualities,\
+   'Exons':Exons, 'Ests':Ests, 'OriginalEsts':OriginalEsts, 'Qualities':Qualities,\
    'SplitPositions':SplitPositions,'Ids':Ids}
 
    # saving new dataset
    io_pickle.save(dataset_file,dataset)
 
 
-def process_read(reReads,fRead,currentGene,chrom,genome_file,sscore_filename):
+def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    """
    The purpose of this function is to create an example for QPalma 
    by using a 
 
    """
+   nil = ('','','','','','','','')
+   extended_alphabet = ['-','a','c','g','t','n','[',']']
+   alphabet = ['-','a','c','g','t','n']
+
+   chr            = fRead['chr']
+   strand         = fRead['strand']
+   quality        = fRead['prb']
+   #quality        = fRead['cal_prb']
+   #quality        = fRead['chastity']
+   spos           = fRead['splitpos']
+   currentReadSeq = fRead['seq']
+   currentReadSeq = currentReadSeq.lower()
+
+   #if currentReadSeq.find('[') == -1:
+   #   print warning
+   #   return nil
+
+   originalReadSeq = fRead['seq']
+
+   chrom             = 'chr%d' % chr
+   sscore_filename   = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s' % (chr,strand)
+
    # use matlab-style functions to access dna sequence
-   currentSeq = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True)
-   currentSeq = currentSeq.lower()
+   currentDNASeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=False)
+   currentDNASeq = currentDNASeq.lower()
 
-   nil = ('','','','','','','')
+   assert currentDNASeq != '', 'load_genomic returned empty sequence!'
 
-   if len(currentSeq) < (currentGene.stop - currentGene.start):
-      return nil
+   seq_has_indels = False
+   for elem in currentReadSeq:
+      #assert elem in extended_alphabet, 'Invalid char (extended_alphabet) in read: \"%s\"' % elem
+      if not elem in extended_alphabet:
+         return nil
+
+      if elem ==  ']':
+         seq_has_indels = True
+
+   for elem in currentDNASeq:
+      #assert elem in alphabet, 'Invalid char in dna: \"%s\"' % elem
+      if not elem in alphabet:
+         return nil
+
+   if strand == '-':
+      try:
+         currentDNASeq = reverse_complement(currentDNASeq)
+      except:
+         print 'invalid char'
+         return nil
+
+   currentDNASeq = 'a' + currentDNASeq
 
-   p_start     = fRead['p_start']
    exon_stop   = fRead['exon_stop']
    exon_start  = fRead['exon_start']
-   p_stop      = fRead['p_stop']
 
-   currentReadSeq = fRead['seq']
-   quality = fRead['prb']
-   spos = fRead['splitpos']
+   # assertions to check whether exon start/stop positions are next to donor
+   # resp. acceptor sites
+   exonEnd     = exon_stop - currentGene.start
+   exonBegin   = exon_start - currentGene.start 
 
-   assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
-   assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
-   assert p_stop - p_start >= 36
+   current_don = currentDNASeq[exonEnd+1:exonEnd+3]
+   current_acc = currentDNASeq[exonBegin-2:exonBegin]
+   if not current_don == 'gt' or current_don == 'gc':
+      return nil
+
+   if not current_acc == 'ag':
+      return nil
+
+   #assert current_don == 'gt' or current_don == 'gc', pdb.set_trace()
+   #assert current_acc == 'ag', pdb.set_trace()
+
+   # assertions for don/acc sites held now we recalculate offset
+   exon_stop   = exon_stop - currentGene.start
+   exon_start  = exon_start - currentGene.start 
+
+   if not seq_has_indels:
+      p_start     = fRead['p_start']
+      exon_stop   = fRead['exon_stop']
+      exon_start  = fRead['exon_start']
+      p_stop      = fRead['p_stop']
+
+      assert (exon_stop - p_start) + (p_stop - exon_start) + 2 == Conf.read_size, 'Invalid read size'
+
+   else:
+      # if we have indels we have to take care of the exons boundaries
+      firstReadSeq   = currentReadSeq[:spos]
+      secondReadSeq  = currentReadSeq[spos:]
+
+      est_fragment = []
+      r_idx = 0
+      while True:
+         if not r_idx < len(currentReadSeq):
+            break
+
+         if currentReadSeq[r_idx] == '[':
+            est_fragment.append(currentReadSeq[r_idx+2])
+            r_idx += 4
+            continue
+
+         est_fragment.append(currentReadSeq[r_idx])
+         r_idx += 1
+
+      dna_fragment_1 = []
+      r_idx = 0
+      while True:
+         if not r_idx < len(firstReadSeq):
+            break
+
+         if firstReadSeq[r_idx] == '[':
+            dna_fragment_1.append(firstReadSeq[r_idx+1])
+            r_idx += 4
+            continue
+
+         dna_fragment_1.append(firstReadSeq[r_idx])
+         r_idx += 1
+
+      dna_fragment_2 = []
+      r_idx = 0
+      while True:
+         if not r_idx < len(secondReadSeq):
+            break
+
+         if secondReadSeq[r_idx] == '[':
+            dna_fragment_2.append(secondReadSeq[r_idx+1])
+            r_idx += 4
+            continue
+
+         dna_fragment_2.append(secondReadSeq[r_idx])
+         r_idx += 1
+
+      currentReadSeq = "".join(est_fragment)
+      dna_fragment_1 = "".join(dna_fragment_1)
+      dna_fragment_2 = "".join(dna_fragment_2)
+
+      dna_fragment_1_size = len([p for p in dna_fragment_1 if p != '-'])
+      dna_fragment_2_size = len([p for p in dna_fragment_2 if p != '-'])
+
+      # new check the fragment of the annotation
+      new_exon1_size = dna_fragment_1_size
+      p_start     = exon_stop - new_exon1_size + 1
+
+      dna_annot_1 = currentDNASeq[p_start:exon_stop+1]
+
+      # new check the fragment of the annotation
+      new_exon2_size = dna_fragment_2_size 
+      p_stop      = exon_start + new_exon2_size - 1
+
+      dna_annot_2 = currentDNASeq[exon_start:p_stop+1]
+
+      #assert dna_fragment_1.replace('-','') == dna_annot_1, pdb.set_trace()
+      #assert dna_fragment_2.replace('-','') == dna_annot_2, pdb.set_trace()
+
+      if not dna_fragment_1.replace('-','') == dna_annot_1:
+         #print 'genomic seq mismatch'
+         #print 'orignal read:\t %s ' % originalReadSeq
+         #print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
+         #print 'dna_fragment_1:\t %s' % dna_fragment_1
+         #print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
+         return nil
+
+      if not dna_fragment_2.replace('-','') == dna_annot_2:
+         #print 'genomic seq mismatch'
+         #print 'orignal read:\t %s ' % originalReadSeq
+         #print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
+         #print 'dna_fragment_2:\t %s' % dna_fragment_2
+         #print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
+         return nil
+
+      #print 'successful'
+
+      for elem in currentReadSeq:
+         assert elem in alphabet, 'Invalid char (alphabet) in read: \"%s\"' % elem
+
+   #assert p_start < exon_stop < exon_start < p_stop, pdb.set_trace()
+   assert p_stop - p_start >= Conf.read_size
 
    currentExons = zeros((2,2),dtype=numpy.int)
 
@@ -163,32 +351,31 @@ def process_read(reReads,fRead,currentGene,chrom,genome_file,sscore_filename):
    currentExons[1,0] = exon_start
    currentExons[1,1] = p_stop
 
-   # make indices relative to start pos of the current gene
-   currentExons -= currentGene.start
-
    # determine cutting positions for sequences
    up_cut   = int(currentExons[0,0])
    down_cut = int(currentExons[1,1])
 
-   # if we perform testing we cut a much wider region as we want to detect
+# if we perform testing we cut a much wider region as we want to detect
    # how good we perform on a region
-   test = False
+   import random
+   ADD_POS = random.randint(Conf.extension[0],Conf.extension[1])
 
    if test:
-      up_cut = up_cut-500
+      up_cut = up_cut-ADD_POS
       if up_cut < 0:
          up_cut = 0
 
-      down_cut = down_cut+500
+      down_cut = down_cut+ADD_POS
 
-      if down_cut > len(currentSeq):
-         down_cut = len(currentSeq)
+      if down_cut > len(currentDNASeq):
+         down_cut = len(currentDNASeq)
 
    # cut out part of the sequence
-   currentSeq = currentSeq[up_cut:down_cut]
+   currentDNASeq = currentDNASeq[up_cut:down_cut]
 
    # ensure that index is correct
    currentExons[0,1] += 1
+   currentExons[1,1] += 1
 
    if test:
       currentExons -= up_cut
@@ -196,27 +383,33 @@ def process_read(reReads,fRead,currentGene,chrom,genome_file,sscore_filename):
       currentExons -= currentExons[0,0]
 
    try:
-      if not (currentSeq[int(currentExons[0,1])] == 'g' and\
-      currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
+      if not (currentDNASeq[int(currentExons[0,1])] == 'g' and\
+      currentDNASeq[int(currentExons[0,1])+1] in ['c','t' ]):
+         print 'not aligned'
          return nil
 
-      if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
+      if not (currentDNASeq[int(currentExons[1,0])-1] == 'g' and currentDNASeq[int(currentExons[1,0])-2] == 'a'):
+         print 'not aligned'
          return nil
 
    except:
-      pdb.set_trace()
+      return nil
+      #pdb.set_trace()
 
    # now we want to use interval_query to get the predicted splice scores
    # trained on the TAIR sequence and annotation
-   
-   interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
+   interval_matrix = createIntArrayFromList([currentGene.start+up_cut-100,currentGene.start+down_cut+100])
    num_fields = 1
    num_intervals = 1
+   sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
+   % (chr,strand)
+
+   ag_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and currentDNASeq[p-1]=='a' and e=='g' ]
 
    # fetch acceptor scores
    gf = Genef()
    num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
-   assert num_hits <= len(currentSeq)
+   assert num_hits <= len(currentDNASeq)
    
    #print 'Acceptor hits: %d' % num_hits
    pos = createIntArrayFromList([0]*num_hits)
@@ -224,12 +417,14 @@ def process_read(reReads,fRead,currentGene,chrom,genome_file,sscore_filename):
    scores = createDoubleArrayFromList([0]*num_hits*num_fields)
    gf.getResults(pos,indices,scores)
 
-   acc  = [-inf]*(len(currentSeq)+1)
-   
+   acc  = [-inf]*(len(currentDNASeq))
+
    for i in range(num_hits):
       position = pos[i] 
       position -= (currentGene.start+up_cut)
-      assert 0 <= position < len(currentSeq)+1, 'position index wrong'
+      if position < 2 or position > len(currentDNASeq)-1:
+         continue
+      assert 0 <= position < len(currentDNASeq), 'position index wrong'
       acc[position] = scores[i]
 
    acc = acc[1:] + [-inf]
@@ -237,9 +432,14 @@ def process_read(reReads,fRead,currentGene,chrom,genome_file,sscore_filename):
    del gf
 
    # fetch donor scores
+   sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s'\
+   % (chr,strand)
+
    gf = Genef()
    num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
-   assert num_hits <= len(currentSeq)
+   assert num_hits <= len(currentDNASeq)
+
+   gt_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and e=='g' and (currentDNASeq[p+1]=='t' or currentDNASeq[p+1]=='c')]
 
    #print 'Donor hits: %d' % num_hits
    pos = createIntArrayFromList([0]*num_hits)
@@ -247,29 +447,53 @@ def process_read(reReads,fRead,currentGene,chrom,genome_file,sscore_filename):
    scores = createDoubleArrayFromList([0]*num_hits*num_fields)
    gf.getResults(pos,indices,scores)
 
-   don     = [-inf]*(len(currentSeq))
+   don     = [-inf]*(len(currentDNASeq))
 
    try:
-      for i in range(1,num_hits):
+      for i in range(num_hits):
          position = pos[i] 
          position -= (currentGene.start+up_cut)
+         if position < 1 or position >= len(currentDNASeq)-1:
+            continue
          don[position-1] = scores[i]
    except:
-      pdb.set_trace()
+      return nil
+      #pdb.set_trace()
 
    don = [-inf] + don[:-1]
 
-   return currentSeq, acc, don, currentExons, currentReadSeq, quality, spos
+   #assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
+   #assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
+   #assert (len(currentDNASeq)) == len(acc), pdb.set_trace()
+   #assert (len(currentDNASeq)) == len(don), pdb.set_trace()
+
+
+   if not ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf]:
+      return nil
+
+   if not gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf]:
+      return nil
+
+   if not (len(currentDNASeq)) == len(acc):
+      return nil
+
+   if not (len(currentDNASeq)) == len(don):
+      return nil
+
+   acc = [-inf] + acc[:-1]
+
+   return currentDNASeq, acc, don, currentExons, currentReadSeq, originalReadSeq, quality, spos
+
 
 def reverse_complement(seq):
    map = {'a':'t','c':'g','g':'c','t':'a'}
 
    new_seq = [map[elem] for elem in seq]
    new_seq.reverse()
+   new_seq = "".join(new_seq)
 
    return new_seq
    
-
 if __name__ == '__main__':
    if len(sys.argv) == 1:
       print info