+ fixed bug in position calculation for negative strand
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 7 Aug 2008 14:47:01 +0000 (14:47 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 7 Aug 2008 14:47:01 +0000 (14:47 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@10352 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/createAlignmentFileFromPrediction.py
tools/run_specific_scripts/transcriptome_analysis/createExonInfoForGenefinding.py [new file with mode: 0755]

index 885149c..64fb471 100644 (file)
@@ -2,12 +2,12 @@
 # -*- coding: utf-8 -*- 
 
 import cPickle
-import sys
-import pdb
-import os
-import os.path
 import math
 import numpy
+import os
+import os.path
+import pdb
+import sys
 
 import qparser
 
@@ -16,6 +16,8 @@ from Utils import pprint_alignment
 import palma.palma_utils as pu
 from palma.output_formating import print_results
 
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
+
 
 def alignment_reconstruct(current_prediction,num_exons):
    """
@@ -61,12 +63,11 @@ def alignment_reconstruct(current_prediction,num_exons):
       
       exonIdentities[idx] = identity
       exonGaps[idx] = gaps
-
+   
    return exonIdentities,exonGaps
 
 
 def create_alignment_file(current_loc,out_fname):
-   print 'Working on %s'%current_loc
 
    out_fh = open(out_fname,'w+')
    allPredictions = cPickle.load(open(current_loc))
@@ -74,6 +75,25 @@ def create_alignment_file(current_loc,out_fname):
 
    print 'Loaded %d examples' % len(allPredictions)
 
+   # fetch the data needed
+   g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+   acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+   don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+   g_fmt = 'chr%d.dna.flat'
+   s_fmt = 'contig_%d%s'
+
+   num_chromo = 6
+
+   accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+   seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+
+   # implement uniqueness filtering using alignment score for reads with
+   # several alignments
+
+   #for pred_idx,current_prediction in enumerate(allPredictions):
+   
+
    written_lines = 0
    for pred_idx,current_prediction in enumerate(allPredictions):
       id = current_prediction['id']
@@ -94,13 +114,12 @@ def create_alignment_file(current_loc,out_fname):
       start_pos   = current_prediction['start_pos']
       alignment   = current_prediction['alignment']
 
-      DPScores   = current_prediction['DPScores']
-
-      predExons = current_prediction['predExons']
-      predExons = [e+start_pos for e in predExons]
+      DPScores    = current_prediction['DPScores']
+      predExons   = current_prediction['predExons']
+      predExons   = [e+start_pos for e in predExons]
 
       (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
-      tExonSizes,tStarts, tEnds) = alignment 
+      tExonSizes,tStarts, tEnds) = alignment
 
       if len(qExonSizes) != num_exons:
          print 'BUG exon sizes %d'%id
@@ -164,13 +183,34 @@ def create_alignment_file(current_loc,out_fname):
          blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
 
       else:
-
-         try:
-            exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
-         except:
-            print 'Bug in example %d (idx: %d)' %\
-            (current_prediction['id'],pred_idx)
-            continue
+         #try:
+         exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
+         #except:
+         #   print 'Bug in example %d (idx: %d)' %\
+         #   (current_prediction['id'],pred_idx)
+         #   continue
+
+         #t_size = tEnd - tStart
+         #read_size = 38
+         #if strand == '-':
+         #   total_size = seqInfo.chromo_sizes[chromo]
+
+         #   start_pos = total_size - start_pos
+
+         #   qExonSizes.reverse()
+         #   qStarts  = [read_size-e for e in qEnds]
+         #   qStarts.reverse()
+         #   qEnds  = [read_size-e for e in qStarts]
+         #   qEnds.reverse()
+
+         #   tExonSizes.reverse()
+         #   tStarts  = [t_size-e for e in tEnds]
+         #   tStarts.reverse()
+         #   tEnds    = [t_size-e for e in tStarts]
+         #   tEnds.reverse()
+
+         #   exonIdentities.reverse()
+         #   exonGaps.reverse()
 
          pp = lambda x : str(x)[1:-1].replace(' ','')
 
@@ -180,12 +220,13 @@ def create_alignment_file(current_loc,out_fname):
          pp(qEnds),pp(tExonSizes),\
          pp(tStarts),pp(tEnds),\
          pp(exonIdentities),pp(exonGaps))
-         written_lines += 1
 
       out_fh.write(new_line)
+      written_lines += 1
 
    print 'Wrote out %d lines' % written_lines
 
+
 if __name__ == '__main__':
    current_dir = sys.argv[1]
    out_fh = sys.argv[2]
diff --git a/tools/run_specific_scripts/transcriptome_analysis/createExonInfoForGenefinding.py b/tools/run_specific_scripts/transcriptome_analysis/createExonInfoForGenefinding.py
new file mode 100755 (executable)
index 0000000..eaf4fd6
--- /dev/null
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+
+import os
+import sys
+
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
+
+def run(chunk_dir,outfile):
+   #chunk_dir = '/fml/ag-raetsch/home/fabio/tmp/sandbox/alignment'
+
+   result_fn=os.path.join(chunk_dir,'all_chunks.txt')
+   cmd = "rm -rf %s; for chunk in `ls -1 %s/chunk*align_remap`; do cat $chunk >> %s; done"%\
+   (result_fn,chunk_dir,result_fn)
+   os.system(cmd)
+
+   out_fh = open(outfile,'w+')
+
+   # fetch the data needed
+   g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+   acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+   don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+   g_fmt = 'chr%d.dna.flat'
+   s_fmt = 'contig_%d%s'
+
+   num_chromo = 6
+
+   accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+   seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+
+   pp = lambda x : str(x)[1:-1].replace(' ','')
+
+   for line in open(result_fn):
+      sl = line.split()
+
+      chromo = int(sl[1])
+      strand = sl[2]
+      start_pos = int(sl[5])-2
+      starts   = sl[15].split(',')
+      ends     = sl[16].split(',')
+
+      ids = sl[-2].split(',')
+      ids   = [int(elem) for elem in ids]
+      gaps = sl[-1].split(',')
+      gaps   = [int(elem) for elem in gaps]
+
+      if strand == '+':
+         pass
+
+      elif strand == '-':
+         total_size = seqInfo.chromo_sizes[chromo]
+         start_pos = total_size - start_pos
+
+         #t_size = 3001
+         #starts  = [t_size-int(e) for e in ends]
+         #starts.reverse()
+         #ends    = [t_size-int(e) for e in starts]
+         #ends.reverse()
+
+         #start_pos = total_size - start_pos
+         #starts   = [total_size - elem for elem in starts]
+         #starts.reverse()
+         #ends     = [total_size - elem for elem in ends]
+         #ends.reverse()
+
+      else:
+         assert False
+
+      starts   = [int(elem) + start_pos for elem in starts]
+      ends     = [int(elem) + start_pos for elem in ends]
+
+      #out_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),str(start_pos),pp(ids),pp(gaps)))
+      out_fh.write("%d\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),pp(ids),pp(gaps)))
+
+if __name__ == '__main__':
+   run(sys.argv[1],sys.argv[2])