import pdb
import re
import subprocess
+import numpy
from numpy.matlib import inf
+def my_load_genomic(fname, strand, start, stop, one_based=True):
+ if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
+ assert(len(start) == len(stop))
+ assert((start[1:]-stop[:-1]>0).all())
+ if strand == '+':
+ idx = xrange(len(start))
+ else:
+ idx = xrange(len(start)-1,-1,-1)
+
+ seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
+ for ix in idx])
+ return seq
+
+ f=file(fname)
+ if one_based:
+ f.seek(start-1)
+ str=f.read(stop-start+1)
+ else:
+ f.seek(start)
+ str=f.read(stop-start)
+
+ if strand=='-':
+ return reverse_complement(str)
+ elif strand=='+':
+ return str
+ else:
+ print 'strand must be + or -'
+ raise KeyError
+
+
+class DataAccessWrapper:
+ """
+ The purpose of this class is to wrap the genomic/splice site score data
+ access.
+
+
+
+ """
+
+
+ def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt):
+ self.genomic_dir = genomic_data_dir
+ self.acc_score_dir = acc_score_dir
+ self.don_score_dir = don_score_dir
+ self.genomic_fmt = gen_fmt
+ self.sscore_fmt = sscore_fmt
+
+ assert os.path.isdir(genomic_data_dir)
+ assert os.path.isdir(acc_score_dir)
+ assert os.path.isdir(don_score_dir)
+
+
+ def get_genomic_fragment_fn(self,id,strand):
+ if strand == '+':
+ return os.path.join(self.genomic_dir,self.genomic_fmt%id)
+ elif strand == '-':
+ return os.path.join(self.genomic_dir,(self.genomic_fmt%id)+'.neg')
+ else:
+ assert False
+
+
+ def get_splice_site_scores_fn(self,id,strand):
+ acc_fn = os.path.join(self.acc_score_dir,self.sscore_fmt%(id,strand))
+ don_fn = os.path.join(self.don_score_dir,self.sscore_fmt%(id,strand))
+ return acc_fn,don_fn
+
+
+
class SeqSpliceInfo():
+ """
+
+ """
- def __init__(self,dna_flat_files,chromo_list):
- assert os.path.isdir(dna_flat_files)
- self.flat_files = dna_flat_files
+ def __init__(self,accessWrapper,id_list):
+ self.accessWrapper = accessWrapper
self.chromo_sizes = {}
- print "Fetching chromosome sizes..."
- for chromo in chromo_list:
- filename = os.path.join(self.flat_files,'chr%d.dna.flat' % chromo)
+ print "Fetching fragment sizes..."
+ for chromo in id_list:
+ filename = self.accessWrapper.get_genomic_fragment_fn(chromo,'+')
total_size = get_flat_file_size(filename)
self.chromo_sizes[chromo] = total_size
total_size = self.chromo_sizes[chromo]
+ acc_fn, don_fn = self.accessWrapper.get_splice_site_scores_fn(chromo,strand)
+
# fetch acceptor scores
- sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
- acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
+ acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,acc_fn,total_size)
# fetch donor scores
- sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
- don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
+ don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,don_fn,total_size)
return acc, don
def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
- chromo_string = 'chr%d' % chromo
- genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
+ #chromo_string = 'chr%d' % chromo
+ #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
+ filename = self.accessWrapper.get_genomic_fragment_fn(chromo,strand)
+ genomicSeq = my_load_genomic(filename,'+',genomicSeq_start,genomicSeq_stop,one_based=False)
genomicSeq = genomicSeq.strip().lower()
# check the obtained dna sequence
assert genomicSeq_start < genomicSeq_stop
- if strand == '-':
- genomicSeq = self.fetch_sequence(chromo+7,'+',genomicSeq_start,genomicSeq_stop)
- else:
- genomicSeq = self.fetch_sequence(chromo,'+',genomicSeq_start,genomicSeq_stop)
+ genomicSeq = self.fetch_sequence(chromo,strand,genomicSeq_start,genomicSeq_stop)
if only_seq:
return genomicSeq
currentAcc = currentAcc[:len(genomicSeq)]
currentDon = currentDon[:len(genomicSeq)]
- #length = len(genomicSeq)
- #currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
- #currentDon = currentDon+[-inf]*(length-len(currentDon))
+ length = len(genomicSeq)
+ currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
+ currentDon = currentDon+[-inf]*(length-len(currentDon))
currentDon[-1] = -inf
gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
print 'ACC: Chromo/strand: %d/%s' % (chromo,strand)
print ag_tuple_pos[:10],ag_tuple_pos[-10:]
print acc_pos[:10],acc_pos[-10:]
- #pdb.set_trace()
+ pdb.set_trace()
if not gt_tuple_pos == don_pos:
print 'DON: Chromo/strand: %d/%s' % (chromo,strand)
print gt_tuple_pos[:10],gt_tuple_pos[-10:]
print don_pos[:10],don_pos[-10:]
- #pdb.set_trace()
+ pdb.set_trace()
#assert ag_tuple_pos == acc_pos, pdb.set_trace()
#assert gt_tuple_pos == don_pos, pdb.set_trace()