+ added convenient script
[qpalma.git] / tools / run_specific_scripts / transcriptome_analysis / createExonInfoForGenefinding.py
1 #!/usr/bin/env python
2
3 import os
4 import sys
5
6 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
7
8 def run(chunk_dir,outfile):
9 #chunk_dir = '/fml/ag-raetsch/home/fabio/tmp/sandbox/alignment'
10
11 result_fn=os.path.join(chunk_dir,'all_chunks.txt')
12 cmd = "rm -rf %s; for chunk in `ls -1 %s/chunk*align_remap`; do cat $chunk >> %s; done"%\
13 (result_fn,chunk_dir,result_fn)
14 os.system(cmd)
15
16 out_fh = open(outfile,'w+')
17
18 # fetch the data needed
19 g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
20 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
21 don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
22
23 g_fmt = 'chr%d.dna.flat'
24 s_fmt = 'contig_%d%s'
25
26 num_chromo = 6
27
28 accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
29 seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
30
31 pp = lambda x : str(x)[1:-1].replace(' ','')
32
33 for line in open(result_fn):
34 sl = line.split()
35
36 chromo = int(sl[1])
37 strand = sl[2]
38 start_pos = int(sl[5])-2
39 starts = sl[15].split(',')
40 ends = sl[16].split(',')
41
42 ids = sl[-2].split(',')
43 ids = [int(elem) for elem in ids]
44 gaps = sl[-1].split(',')
45 gaps = [int(elem) for elem in gaps]
46
47 if strand == '+':
48 pass
49
50 elif strand == '-':
51 total_size = seqInfo.chromo_sizes[chromo]
52 start_pos = total_size - start_pos
53
54 #t_size = 3001
55 #starts = [t_size-int(e) for e in ends]
56 #starts.reverse()
57 #ends = [t_size-int(e) for e in starts]
58 #ends.reverse()
59
60 #start_pos = total_size - start_pos
61 #starts = [total_size - elem for elem in starts]
62 #starts.reverse()
63 #ends = [total_size - elem for elem in ends]
64 #ends.reverse()
65
66 else:
67 assert False
68
69 starts = [int(elem) + start_pos for elem in starts]
70 ends = [int(elem) + start_pos for elem in ends]
71
72 #out_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),str(start_pos),pp(ids),pp(gaps)))
73 out_fh.write("%d\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),pp(ids),pp(gaps)))
74
75
76 cmd = 'rm %s' % result_fn
77 os.system(cmd)
78
79 if __name__ == '__main__':
80 run(sys.argv[1],sys.argv[2])