+ extended scripts
[qpalma.git] / scripts / compile_dataset.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import random
6 import os
7 import re
8 import pdb
9 import pydb
10 import io_pickle
11
12 import numpy
13 from numpy.matlib import mat,zeros,ones,inf
14
15 import qpalma
16 import qpalma.tools
17 from qpalma.tools.parseGff import parse_gff
18
19 from qpalma.parsers import FilteredReadParser, RemappedReadParser, MapParser
20
21 from Genefinding import *
22
23 from genome_utils import load_genomic
24
25 import qpalma.Configuration as Conf
26
27 warning = """
28 #########################################################
29 WARNING ! Disabled the use of non-bracket reads WARNING !
30 #########################################################
31
32 """
33
34
35 help = """
36
37 Usage of this script is:
38
39 ./compile_dataset.py gff_file dna_flat_files solexa_reads remapped_reads dataset_file
40
41 """
42
43 info = """
44 The purpose of this script is to compile a full data set for QPalma out of the
45 following files:
46
47 - The information of the gff (gene annotation).
48
49 - The flat dna files with the sequence of the respective gene positions
50
51 - The solexa reads in the filtered version
52
53 - The solexa reads in the remapped version
54
55 This represents a dataset for use with QPalma
56 The entries are def'd as follows:
57
58 - 'id' a unique id for the read
59 - 'chr' the chromosome identifier
60 - 'strand' the identifier for the strand either '+' or '-'
61
62 - 'read_start'
63 - 'exon_stop'
64 - 'exon_start'
65 - 'read_stop'
66
67 - 'remappped_pos' - the position of the remapped read
68 """
69
70 # some useful constants
71
72 extended_alphabet = ['-','a','c','g','t','n','[',']']
73 alphabet = ['-','a','c','g','t','n']
74
75 # some dummy elements to be returned if current example violates assertions
76 nil_dna_frag = ('','','','','')
77 remapped_nil = ('')
78 process_nil = ('','','','','','','','')
79
80 nil_splice_scores = ('','')
81
82 def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
83
84 assert os.path.exists(filtered_reads)
85 assert os.path.exists(remapped_reads)
86 assert not os.path.exists(dataset_file), 'The data_file already exists!'
87
88 joinp = os.path.join
89
90 # first read the gff file(s) and create pickled gene information
91 allGenes = [None]*6
92
93 for i in range(1,6):
94 allGenes[i]= parse_gff(gff_file%i) #,joinp(tmp_dir,'gff_info.pickle'))
95
96 print 'parsing filtered reads..'
97 frp = FilteredReadParser(filtered_reads)
98 all_filtered_reads = frp.parse()
99
100 print 'found %d filtered reads' % len(all_filtered_reads)
101
102 #print 'parsing remapped reads'
103 #rrp = RemappedReadParser(remapped_reads)
104 #all_remapped_reads = rrp.parse()
105
106 print 'parsing map reads'
107 rrp = MapParser(remapped_reads)
108 all_remapped_reads = rrp.parse()
109
110 if all_remapped_reads != None:
111 size_rem = len(all_remapped_reads)
112 else:
113 size_rem = 0
114
115 print 'found %d remapped reads' % size_rem
116
117 # this stores the new dataset
118 Sequences = []
119 Acceptors = []
120 Donors = []
121 Exons = []
122 Ests = []
123 OriginalEsts = []
124 Qualities = []
125 UpCut = []
126 StartPos = []
127 SplitPositions = []
128 Ids = []
129 FReads = []
130 RemappedReads = []
131
132 AlternativeSequences = []
133
134 # Iterate over all remapped reads in order to generate for each read an
135 # training / prediction example
136 instance_counter = 0
137
138 for fId,currentFRead in all_filtered_reads.items():
139
140 reId = str(1000000000000+int(fId))
141 try:
142 reReads = all_remapped_reads[reId]
143 except:
144 continue
145
146 try:
147 gene_id = currentFRead['gene_id']
148 chro = currentFRead['chr']
149 currentGene = allGenes[chro][gene_id]
150 except:
151 continue
152
153 if currentFRead['strand'] != '+':
154 #print 'wrong strand'
155 continue
156
157 seq, acc, don, exons, est, qual, up_cut, start_pos =\
158 process_filtered_read(currentFRead,currentGene,dna_flat_files,test)
159
160 original_est = currentFRead['seq']
161
162 #alternative_seq = process_remapped_reads(reReads,currentFRead,dna_flat_files)
163 alternative_seq = process_map(reReads,currentFRead,dna_flat_files)
164
165 #check_remapped_reads(reReads,currentFRead,dna_flat_files)
166
167 if seq == '':
168 continue
169
170 Sequences.append(seq)
171 Acceptors.append(acc)
172 Donors.append(don)
173 Exons.append(exons)
174 Ests.append(est)
175 OriginalEsts.append(original_est)
176 Qualities.append(qual)
177 UpCut.append(up_cut)
178 StartPos.append(start_pos)
179 AlternativeSequences.append(alternative_seq)
180
181 instance_counter += 1
182
183 if instance_counter % 1000 == 0:
184 print 'processed %d examples' % instance_counter
185
186 if instance_counter == 2000:
187 break
188
189 print 'Full dataset has size %d' % len(Sequences)
190
191 dataset = [Sequences, Acceptors, Donors,\
192 Exons, Ests, OriginalEsts, Qualities,\
193 UpCut, StartPos, AlternativeSequences]
194
195 #dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
196 #'Exons':Exons, 'Ests':Ests, 'OriginalEsts':OriginalEsts, 'Qualities':Qualities,\
197 #'AlternativeSequences':AlternativeSequences,\
198 #'AlternativeAcceptors':AlternativeAcceptors,\
199 #'AlternativeDonors':AlternativeDonors}
200
201 #'Ids':Ids, 'FReads':FReads}
202
203 # saving new dataset
204 io_pickle.save(dataset_file,dataset)
205
206
207 def process_filtered_read(fRead,currentGene,dna_flat_files,test):
208 """
209 Creates a training example
210
211 """
212
213 quality = fRead['prb']
214 #quality = fRead['cal_prb']
215 #quality = fRead['chastity']
216
217 spos = fRead['splitpos']
218 chr = fRead['chr']
219 strand = fRead['strand']
220
221 # use matlab-style functions to access dna sequence. we fetch a slice that
222 # is a bit bigger than the genome in case the read starts at the very beginning
223 chrom = 'chr%d' % fRead['chr']
224 genomicSeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=False)
225 genomicSeq = genomicSeq.lower()
226
227 # add a leading 'a' for index reasons
228 genomicSeq = 'a' + genomicSeq
229
230 # check the obtained dna sequence
231 assert genomicSeq != '', 'load_genomic returned empty sequence!'
232 for elem in genomicSeq:
233 assert elem in alphabet, pdb.set_trace()
234
235 if strand == '-':
236 try:
237 genomicSeq = reverse_complement(genomicSeq)
238 except:
239 print 'invalid char'
240 return process_nil
241
242 # fetch the dna fragment that represents the part of the two exons
243 dnaFragment,currentExons,up_cut,down_cut, currentReadSeq = getDNAFragment(currentGene,genomicSeq,fRead)
244
245 if dnaFragment == '':
246 return process_nil
247
248 intervalBegin = currentGene.start+up_cut-100
249 intervalEnd = currentGene.start+down_cut+100
250
251 seq_pos_offset = currentGene.start+up_cut
252
253 don, acc = getSpliceScores(chr,strand,intervalBegin,intervalEnd,dnaFragment,seq_pos_offset )
254
255 return dnaFragment, acc, don, currentExons, currentReadSeq, quality, up_cut,\
256 currentGene.start+up_cut
257
258
259 def getDNAFragment(currentGene,currentDNASeq,fRead):
260 """
261 This function calculates the dna fragment that corresponds to the two exon
262 parts of the read
263
264 """
265 #print 'entering getDNAFragment...'
266
267 exon_stop = fRead['exon_stop']
268 exon_start = fRead['exon_start']
269
270 currentReadSeq = fRead['seq']
271
272 # assertions to check whether exon start/stop positions are next to donor
273 # resp. acceptor sites
274 exonEnd = exon_stop - currentGene.start
275 exonBegin = exon_start - currentGene.start
276
277 current_don = currentDNASeq[exonEnd+1:exonEnd+3]
278 current_acc = currentDNASeq[exonBegin-2:exonBegin]
279
280 if not (current_don == 'gt' or current_don == 'gc'):
281 print 'fragment not aligned'
282 #pydb.debugger()
283 return nil_dna_frag
284
285 if not current_acc == 'ag':
286 print 'fragment not aligned'
287 #pydb.debugger()
288 return nil_dna_frag
289
290 #assert current_don == 'gt' or current_don == 'gc', pdb.set_trace()
291 #assert current_acc == 'ag', pdb.set_trace()
292 # assertions for don/acc sites held now we recalculate offset
293 #assert (exon_stop - p_start) + (p_stop - exon_start) + 2 == Conf.read_size, 'Invalid read size'
294
295 # now we check whether the read sequence contains indels. if so we have to recalculate the boundaries
296 # if not we use p_start and p_stop directly
297 seq_has_indels = False
298 for elem in currentReadSeq:
299 #assert elem in extended_alphabet, 'Invalid char (extended_alphabet) in read: \"%s\"' % elem
300 if not elem in extended_alphabet:
301 print 'incorrect alphabet'
302 return nil_dna_frag
303
304 if elem == ']':
305 seq_has_indels = True
306
307 if not seq_has_indels:
308 p_start = fRead['p_start']
309 p_stop = fRead['p_stop']
310 else:
311 spos = fRead['splitpos']
312 # if we have indels we have to take care of the exons boundaries
313 firstReadSeq = currentReadSeq[:spos]
314 secondReadSeq = currentReadSeq[spos:]
315
316 est_fragment = []
317 r_idx = 0
318 while True:
319 if not r_idx < len(currentReadSeq):
320 break
321
322 if currentReadSeq[r_idx] == '[':
323 est_fragment.append(currentReadSeq[r_idx+2])
324 r_idx += 4
325 continue
326
327 est_fragment.append(currentReadSeq[r_idx])
328 r_idx += 1
329
330 dna_fragment_1 = []
331 r_idx = 0
332 while True:
333 if not r_idx < len(firstReadSeq):
334 break
335
336 if firstReadSeq[r_idx] == '[':
337 dna_fragment_1.append(firstReadSeq[r_idx+1])
338 r_idx += 4
339 continue
340
341 dna_fragment_1.append(firstReadSeq[r_idx])
342 r_idx += 1
343
344 dna_fragment_2 = []
345 r_idx = 0
346 while True:
347 if not r_idx < len(secondReadSeq):
348 break
349
350 if secondReadSeq[r_idx] == '[':
351 dna_fragment_2.append(secondReadSeq[r_idx+1])
352 r_idx += 4
353 continue
354
355 dna_fragment_2.append(secondReadSeq[r_idx])
356 r_idx += 1
357
358 currentReadSeq = "".join(est_fragment)
359 dna_fragment_1 = "".join(dna_fragment_1)
360 dna_fragment_2 = "".join(dna_fragment_2)
361
362 dna_fragment_1_size = len([p for p in dna_fragment_1 if p != '-'])
363 dna_fragment_2_size = len([p for p in dna_fragment_2 if p != '-'])
364
365 # new check the fragment of the annotation
366 new_exon1_size = dna_fragment_1_size
367 p_start = exon_stop - new_exon1_size + 1
368
369 dna_annot_1 = currentDNASeq[p_start-currentGene.start:exon_stop+1-currentGene.start]
370
371 # new check the fragment of the annotation
372 new_exon2_size = dna_fragment_2_size
373 p_stop = exon_start + new_exon2_size - 1
374
375 dna_annot_2 = currentDNASeq[exon_start-currentGene.start:p_stop+1-currentGene.start]
376
377 #assert dna_fragment_1.replace('-','') == dna_annot_1, pdb.set_trace()
378 #assert dna_fragment_2.replace('-','') == dna_annot_2, pdb.set_trace()
379
380 if not dna_fragment_1.replace('-','') == dna_annot_1:
381 print 'genomic seq mismatch'
382 #pydb.debugger()
383 print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
384 print 'dna_fragment_1:\t %s' % dna_fragment_1
385 print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
386 return nil_dna_frag
387
388 if not dna_fragment_2.replace('-','') == dna_annot_2:
389 print 'genomic seq mismatch'
390 #pydb.debugger()
391 print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
392 print 'dna_fragment_2:\t %s' % dna_fragment_2
393 print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
394 return nil_dna_frag
395
396 for elem in currentReadSeq:
397 assert elem in alphabet, 'Invalid char (alphabet) in read: \"%s\"' % elem
398
399 assert p_stop - p_start >= Conf.read_size
400
401 # now that we have p_start and p_stop properly
402 # we can define the correct exons borders
403 currentExons = zeros((2,2),dtype=numpy.int)
404
405 currentExons[0,0] = p_start
406 currentExons[0,1] = exon_stop
407
408 currentExons[1,0] = exon_start
409 currentExons[1,1] = p_stop
410
411 currentExons -= currentGene.start
412
413 # determine cutting positions for sequences
414 # we cut out a slightly bigger region as we want to learn the read start and
415 # stop positions as well
416 ADD_POS = random.randint(Conf.extension[0],Conf.extension[1])
417
418 if currentExons[0,0] <= ADD_POS:
419 up_cut = currentExons[0,0]
420 else:
421 up_cut = currentExons[0,0] - ADD_POS
422
423 if currentGene.stop - currentExons[1,1] <= ADD_POS:
424 down_cut = currentGene.stop - currentExons[1,1]
425 else:
426 down_cut = currentExons[1,1] + ADD_POS
427
428 # make exox positions start relative to dna sequence start
429 currentExons -= up_cut
430
431 # cut out part of the sequence
432 currentDNASeq = currentDNASeq[up_cut:down_cut]
433
434 # ensure that index is correct
435 currentExons[0,1] += 1
436 currentExons[1,1] += 1
437
438 try:
439 if not (currentDNASeq[int(currentExons[0,1])] == 'g' and currentDNASeq[int(currentExons[0,1])+1] in ['c','t' ]):
440 print 'not aligned'
441 return nil_dna_frag
442
443 if not (currentDNASeq[int(currentExons[1,0])-1] == 'g' and currentDNASeq[int(currentExons[1,0])-2] == 'a'):
444 print 'not aligned'
445 return nil_dna_frag
446 except:
447 print 'error with exon boundaries'
448 return nil_dna_frag
449
450 #print 'leaving getDNAFragment...'
451
452 # end of exons/read borders calculation
453 return currentDNASeq, currentExons, up_cut, down_cut, currentReadSeq
454
455
456 def getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_offset):
457 """
458 Now we want to use interval_query to get the predicted splice scores trained
459 on the TAIR sequence and annotation
460 """
461
462 #print 'entering getSpliceScores...'
463
464 ag_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and currentDNASeq[p-1]=='a' and e=='g' ]
465 gt_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and e=='g' and (currentDNASeq[p+1]=='t' or currentDNASeq[p+1]=='c')]
466
467 acc = len(currentDNASeq)*[0.0]
468 don = len(currentDNASeq)*[0.0]
469
470 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
471 pos_size = new_intp()
472 intp_assign(pos_size,1)
473
474 # now fetch acceptor and donor scores:
475
476 sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
477 % (chr,strand)
478
479 # fetch acceptor scores
480 result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1,pos_size)
481
482 num_hits = result[0]
483 assert num_hits <= len(currentDNASeq)
484 pos = result[1]
485 scores = result[3]
486
487 acc = [-inf]*(len(currentDNASeq))
488
489 for i in range(num_hits):
490 position = pos[i]
491 position -= seq_pos_offset
492 if position < 2 or position > len(currentDNASeq)-1:
493 continue
494 assert 0 <= position < len(currentDNASeq), 'position index wrong'
495 acc[position] = scores[i]
496
497 acc = acc[1:] + [-inf]
498
499 # fetch donor scores
500 sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s'\
501 % (chr,strand)
502
503 result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1,pos_size)
504
505 num_hits = result[0]
506 assert num_hits <= len(currentDNASeq)
507 pos = result[1]
508 scores = result[3]
509
510 #print 'Donor hits: %d' % num_hits
511 don = [-inf]*(len(currentDNASeq))
512
513 try:
514 for i in range(num_hits):
515 position = pos[i]
516 position -= seq_pos_offset
517 if position < 1 or position >= len(currentDNASeq)-1:
518 continue
519 don[position-1] = scores[i]
520 except:
521 print 'problem with donor scores'
522 return nil_splice_scores
523
524 don = [-inf] + don[:-1]
525
526 #assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
527 #assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
528 #assert (len(currentDNASeq)) == len(acc), pdb.set_trace()
529 #assert (len(currentDNASeq)) == len(don), pdb.set_trace()
530
531 if not ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf]:
532 print 'problem with acceptor scores'
533 pydb.debugger()
534 return nil_splice_scores
535
536 if not gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf]:
537 print 'problem with donor scores'
538 return nil_splice_scores
539
540 if not (len(currentDNASeq)) == len(acc):
541 return nil_splice_scores
542
543 if not (len(currentDNASeq)) == len(don):
544 return nil_splice_scores
545
546 acc = [-inf] + acc[:-1]
547
548 #print 'leaving getSpliceScores...'
549
550 return don, acc
551
552 def process_map(reReads,fRead,dna_flat_files):
553 """
554 For all matches found by Vmatch we calculate the fragment of the DNA we
555 would like to perform an aligment during prediction.
556 """
557
558 fragment_size = 3000
559
560 alternativeSeq = []
561
562 for currentRRead in reReads:
563 rId = currentRRead['id']
564 pos = currentRRead['pos']
565 chr = currentRRead['chr']
566 strand = currentRRead['strand']
567 seq = currentRRead['seq']
568
569 if strand != '+':
570 continue
571
572 length = currentRRead['length']
573 offset = currentRRead['offset']
574
575 currentLabel = False
576
577 # vmatch found the correct position
578 if fRead['chr'] == chr and fRead['strand'] == strand and fRead['p_start'] <= pos <= fRead['p_stop']:
579 currentLabel = True
580
581 # first half of the read matches somewhere
582 # search downstream for the rest
583 if offset == 0:
584 genomicSeq_start = pos
585 genomicSeq_stop = pos + fragment_size
586 alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
587 continue
588
589 # second half of the read matches somewhere
590 # search upstream for the rest
591 elif offset != 0 and length + offset == Conf.read_size:
592 genomicSeq_start = pos - fragment_size
593 genomicSeq_stop = pos+length
594 alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
595 continue
596
597 # middle part of the read matches somewhere
598 # search up/downstream for the rest
599 else:
600 genomicSeq_start = pos
601 genomicSeq_stop = pos + fragment_size
602 alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
603
604 genomicSeq_start = pos - fragment_size
605 genomicSeq_stop = pos+length
606 alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
607 continue
608
609 return alternativeSeq
610
611
612 def process_remapped_reads(reReads,fRead,dna_flat_files):
613 """
614 For all matches found by Vmatch we calculate the fragment of the DNA we
615 would like to perform an aligment during prediction.
616 """
617
618 fragment_size = 3000
619
620 seq = fRead['seq']
621 strand = fRead['strand']
622 pos = fRead['p_start']
623 chr = fRead['chr']
624
625 alternativeSeq = []
626
627 for currentRRead in reReads:
628 rId = currentRRead['id']
629
630 pos1 = currentRRead['pos1']
631 chr1 = currentRRead['chr1']
632 seq1 = currentRRead['seq1']
633
634 s1_pos = seq.find(seq1)
635
636 seq2 = currentRRead['seq2']
637
638 if not seq2 == '':
639 pos2 = currentRRead['pos2']
640 chr2 = currentRRead['chr2']
641 s2_pos = seq.find(seq2)
642
643 # vmatch found a correct position
644 if pos + s1_pos == pos1:
645 genomicSeq_start = pos1
646 genomicSeq_stop = pos1 + fragment_size
647 currentLabel = True
648 else:
649 genomicSeq_start = pos1
650 genomicSeq_stop = pos1 + fragment_size
651 currentLabel = False
652
653 alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
654
655 return alternativeSeq
656
657 def check_remapped_reads(reReads,fRead,dna_flat_files):
658 """
659 For all matches found by Vmatch we calculate the fragment of the DNA we
660 would like to perform an aligment during prediction.
661 """
662
663 fragment_size = 3000
664
665 seq = fRead['seq']
666 strand = fRead['strand']
667 pos = fRead['p_start']
668 chr = fRead['chr']
669
670 print 'fRead:\t%d %d %d %d' %\
671 (fRead['p_start'],fRead['exon_stop'],fRead['exon_start'],fRead['p_stop'])
672
673 for currentRRead in reReads:
674 rId = currentRRead['id']
675
676 pos1 = currentRRead['pos1']
677 chr1 = currentRRead['chr1']
678 seq1 = currentRRead['seq1']
679
680 seq2 = currentRRead['seq2']
681 pos2 = 0
682
683 if not seq2 == '':
684 pos2 = currentRRead['pos2']
685 chr2 = currentRRead['chr2']
686 s2_pos = seq.find(seq2)
687
688 print 'reRead:\t %d %d %d %d'%(pos1,pos1+len(seq1),pos2,pos2+len(seq2))
689
690
691 def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
692 """
693 This function expects an interval, chromosome and strand information and
694 returns then the genomic sequence of this interval and the associated scores.
695 """
696
697 chrom = 'chr%d' % chr
698 genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
699 genomicSeq = genomicSeq.lower()
700
701 # check the obtained dna sequence
702 assert genomicSeq != '', 'load_genomic returned empty sequence!'
703 #for elem in genomicSeq:
704 # if not elem in alphabet:
705
706 no_base = re.compile('[^acgt]')
707 genomicSeq = no_base.sub('n',genomicSeq)
708
709 intervalBegin = genomicSeq_start-100
710 intervalEnd = genomicSeq_stop+100
711 currentDNASeq = genomicSeq
712 seq_pos_offset = genomicSeq_start
713
714 currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_offset)
715
716 return currentDNASeq, currentAcc, currentDon
717
718 def reverse_complement(seq):
719 map = {'a':'t','c':'g','g':'c','t':'a'}
720
721 new_seq = [map[elem] for elem in seq]
722 new_seq.reverse()
723 new_seq = "".join(new_seq)
724
725 return new_seq
726
727
728 if __name__ == '__main__':
729 if len(sys.argv) == 1:
730 print info
731
732 assert len(sys.argv) == 6, help
733 compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file)