recursive-include doc *.tex Makefile
recursive-include scripts *.py
recursive-include qpalma *.py
+recursive-include tools *.py
recursive-include tests *.py
include test.conf
import os.path
import pdb
-from sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
+from sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
jp = os.path.join
seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
for line in open(map_file):
- if line.startswith('#'):
+ line = line.strip()
+ if line.startswith('#') or line == '':
continue
if instance_counter > 0 and instance_counter % 5000 == 0:
if pos+half_window_size < seqInfo.getFragmentSize(chromo):
ds_offset = half_window_size
else:
- ds_offset = seqInfo.chromo_sizes[chromo]-pos-1
+ ds_offset = seqInfo.getFragmentSize(chromo)-pos-1
genomicSeq_start = pos - us_offset
genomicSeq_stop = pos + ds_offset
translation = '-acgtn_'
# convenience function for pretty-printing arrays
-pp = lambda x : str(x)[1:-1].replace(' ','')
+pp = lambda x : str(x)[1:-1].replace(' ',' ')
def alignment_reconstruct(current_prediction,num_exons):
wise we just have to add an offset i.e. the start position.
"""
+ print 'window_size is %d' % window_size
+ print start_pos
+ print starts
+ print ends
+
if strand == '-':
- total_size = seqInfo.getFragmentSize(chromo)
- start_pos = total_size - start_pos
+ _ends = [window_size - int(elem) + 2 for elem in starts]
+ _starts = [window_size - int(elem) + 2 for elem in ends]
+ starts = _starts
+ ends = _ends
- if strand == '+':
- starts = [int(elem) + start_pos-1 for elem in starts]
- ends = [int(elem) + start_pos-1 for elem in ends]
- else:
- starts = [int(elem) + start_pos for elem in starts]
- ends = [int(elem) + start_pos for elem in ends]
+ print start_pos
+ print starts
+ print ends
- if strand == '-':
- starts = [e-window_size for e in starts]
- ends = [e-window_size for e in ends]
+ starts = [int(elem) + start_pos-1 for elem in starts]
+ ends = [int(elem) + start_pos-1 for elem in ends]
return starts,ends
gaps = exonGaps
gaps = [int(elem) for elem in gaps]
- new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
- (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
+ new_line = '%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
+ (id,chromo,strand,seq,start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
pp(qExonSizes),pp(qStarts),pp(qEnds),pp(tExonSizes),\
pp(tStarts),pp(tEnds),pp(ids),pp(gaps))
total_size = self.chromo_sizes[chromo]
- if strand == '-':
- s_start = total_size - genomicSeq_stop
- s_stop = total_size - genomicSeq_start
- genomicSeq_start = s_start
- genomicSeq_stop = s_stop
+ print 'genomicSeq: ' + str(genomicSeq_start),str(genomicSeq_stop)
assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
assert genomicSeq_start >= 0
genomicSeq_start = s_start
genomicSeq_stop = s_stop
+
assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
genomicSeq = self.fetch_sequence(id,strand,genomicSeq_start,genomicSeq_stop)
import cPickle
import random
+import sys
import math
import numpy
from numpy import inf
print 'Problem counter is %d' % qp.problem_ctr
-def check_reverse_strand_calculation(id,b,e,seqInfo):
- seq,acc,don = seqInfo.get_seq_and_scores(id,'-',b,e,only_seq=False,perform_checks=False)
+def check_reverse_strand_calculation(id,b,e,seqInfo):
total_size = seqInfo.getFragmentSize(1)
bp = total_size - e
ep = total_size - b
+
+ seq,acc,don = seqInfo.get_seq_and_scores(id,'-',b,e,only_seq=False,perform_checks=False)
seqp,acc,don = seqInfo.get_seq_and_scores(id,'+',bp,ep,only_seq=False,perform_checks=False)
seqp = reverse_complement(seqp)
- return (seq == seqp)
+ res1 = (seq == seqp)
+ #print seq
+ #print seqp
+
+ seq,acc,don = seqInfo.get_seq_and_scores(id,'+',b,e,only_seq=False,perform_checks=False)
+ seq,acc,don = seqInfo.get_seq_and_scores(id,'-',bp,ep,only_seq=False,perform_checks=False)
+ #seqp = reverse_complement(seq)
+
+ res2 = (seq == seqp)
+ print seq
+ print seqp
+
+ return res1,res2
+
def check_example(chromo,strand,b,e,seqInfo,lt1):
dna,acc,don = seqInfo.get_seq_and_scores(1,strand,b,e)
for (b,e) in [(206,874),(545,874),(999,1234),(1000,total_size-1000),(3,total_size-3),(0,total_size)]:
check_example(chromo,strand,b,e,seqInfo,lt1)
- for (b,e) in intervals:
- print 'Rev strand calculation: %s'%str(check_reverse_strand_calculation(1,b,e,seqInfo))
+ #for (b,e) in intervals:
+ # r1,r2 = check_reverse_strand_calculation(1,b,e,seqInfo)
+ # print b,e
+ # print 'Rev strand calculation: %s %s'%(str(r1),str(r2))
def checks():
accessWrapper = DataAccessWrapper(settings)
seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
- #lt1 = None
- lt1 = LookupTable(settings)
+ lt1 = None
+ #lt1 = LookupTable(settings)
print 'Checking with toy data...'
simple_check(settings,seqInfo,lt1)
#print 'Checking with real data...'
#simple_check(settings,seqInfo,lt1)
+ b = 30427463
+ e = 30427563
+ seq,acc,don = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=False,perform_checks=True)
+ print seq
+
def run():
print 'Creating some artifical data...'
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import array
+
+def dumpSpliceScoreInformation(in_fn):
+
+ pos_fn = in_fn + '.pos'
+ scores_fn = in_fn + '.Conf'
+
+ positions = array.array('I')
+ positions.fromfile(open(pos_fn),60)
+
+ scores = array.array('f')
+ scores.fromfile(open(scores_fn),60)
+
+ print [p for p in positions if p < 560]
+ #print scores[:10]
+
+
+if __name__ == '__main__':
+ in_fn = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_1+'
+ dumpSpliceScoreInformation(in_fn)
+
+ in_fn = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_1+'
+ dumpSpliceScoreInformation(in_fn)
+
+ in_fn = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_1-'
+ dumpSpliceScoreInformation(in_fn)
+
+ in_fn = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_1-'
+ dumpSpliceScoreInformation(in_fn)