%include "penalty_info.h"
%include "qpalma_dp.h"
-/*
%array_class(int, intArray) ;
%array_class(double, doubleArray) ;
-%array_functions(double, doubleFArray) ;
%array_functions(struct penalty_struct, penaltyArray) ;
+
+/*
+%array_functions(double, doubleFArray) ;
%array_class(Pre_score, Pre_scoreArray) ;
+*/
%pythoncode
%{
for i in range(array_len):
list[i] = doubleArray_getitem(array,i)
return list
-
%}
-*/
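For reference, the %array_class declarations above make SWIG generate simple
proxy wrappers around raw C arrays. A minimal usage sketch from the Python
side (an assumption-laden sketch: it presumes the extension module builds as
QPalmaDP and uses the flattened accessor style of the helper above):

   from QPalmaDP import doubleArray, doubleArray_getitem, doubleArray_setitem

   arr = doubleArray(5)                     # allocates a C double[5]
   for i in range(5):
       doubleArray_setitem(arr, i, 0.5*i)   # proxy setter
   print doubleArray_getitem(arr, 3)        # proxy getter, as in the helper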
as well as the core binaries.
-Instructions
-------------
+Installation Instructions
+-------------------------
-Type
+You need the following packages:
+
+- SWIG ( >=1.3.31 )
+- numpy ( >=1.0.4 )
+
+- Pythongrid
+- Genefinding
+
+You can obtain SWIG from http://www.swig.org and numpy from http://numpy.scipy.org/,
+or ask your sysadmin to install these packages.
+
+Pythongrid and Genefinding are two utility packages from our lab. These can be obtained via:
+
+ http://
+
+Once you have the above packages installed, you can start installing QPalma.
+
+Assuming you downloaded QPalma-1.0.tar.gz, type:
+
+ tar -xzvf QPalma-1.0.tar.gz
+
+ cd QPalma-1.0
python setup.py build
-to build the necessary C++ extension modules and copy the QPalma python code.
+to build the necessary C++ extension modules and the Python code.
+
+
+Setting environment variables
+-----------------------------
+
+You will need the following variables set:
+
+- SGE_ROOT should point to the root directory of your Sun Grid Engine installation (see the example below).
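+
+For example, assuming a Sun Grid Engine installation under /usr/local/sge:
+
+   export SGE_ROOT=/usr/local/sge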
-HELP!
------
-
-If however everything failes contact:
+Contact
+-------
-fabio@tuebingen.mpg.de
+If everything else fails, contact: fabio@tuebingen.mpg.de
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import os.path
-import cPickle
-
-#
-# choose a path where all results of the QPalma pipeline will be stored
-#
-
-result_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/prediction'
-
-
-###############################################################################
-# Load a random but fixed initial parameter vector this makes debugging easier
-###############################################################################
-
-#fixedParamQ = cPickle.load(open('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/randInitParam.pickle'))
-
-###############################################################################
-#
-# The parameters for the QPalma algorithm
-#
-#min_intron_len = 20
-#max_intron_len = 2000
-#
-#min_svm_score = 0.0
-#max_svm_score = 1.0
-#
-#min_qual = -5
-#max_qual = 40
-
-numConstraintsPerRound = 50
-
-###############################################################################
-#
-# CHOOSING THE MODE
-#
-# 'normal' means work like Palma 'using_quality_scores' means work like Palma
-# plus using sequencing quality scores
-#
-###############################################################################
-
-#mode = 'normal'
-mode = 'using_quality_scores'
-
-###############################################################################
-#
-# When using quality scores our scoring function is defined as
-#
-# f: S_e x R x S -> R, where S_e is {A,C,G,T,N} and S = {A,C,G,T,N,-}
-#
-# as opposed to a usage without quality scores when we only have
-#
-# f: S x S -> R
-#
-# The matrix of plifs is defined as follows:
-#
-# elem | - a c g t n
-# -------------------------
-# idx | 0 1 2 3 4 5
-#
-# dna
-#
-# - a c g t n
-# a
-# est c
-# g
-# t
-# n
-#
-# so the index can be calculated as (estnum-1)*6 + dnanum.
-# Ests do not have gaps with quality scores so we look up the matchmatrix
-# instead.
-###############################################################################
-
-read_size = 38
-#read_size = 36
-extension = (250,500)
-
-numLengthSuppPoints = 10
-numDonSuppPoints = 10
-numAccSuppPoints = 10
-numQualSuppPoints = 10
-
-if mode == 'normal':
- sizeMatchmatrix = (6,6)
- estPlifs = 0
- dnaPlifs = 0
- numQualPlifs = estPlifs*dnaPlifs
-elif mode == 'using_quality_scores':
- sizeMatchmatrix = (6,1)
- estPlifs = 5
- dnaPlifs = 6
- numQualPlifs = estPlifs*dnaPlifs
-else:
- assert False, 'Wrong operation mode specified'
-
-###############################################################################
-#
-# GENERAL SETTINGS CONCERNING THE SOLVER
-#
-###############################################################################
-
-iter_steps = 40
-remove_duplicate_scores = False
-print_matrix = False
-anzpath = 2
-
-if mode == 'normal':
- #fixedParam = fixedParamQ
- fixedParam = None
-elif mode == 'using_quality_scores':
- fixedParam = None
-else:
- assert False, 'Wrong operation mode specified'
-
-###############################################################################
-#
-# DATA SETTINGS CONCERNING THE SPLITS AND FILE LOCATIONS
-#
-###############################################################################
-training_begin = 0
-training_end = 10000
-
-prediction_begin = 10000
-prediction_end = 40000
-
-joinp = os.path.join
-
-tmp_dir = '/fml/ag-raetsch/home/fabio/tmp/solexa_tmp'
-data_path = '/fml/ag-raetsch/share/projects/qpalma/solexa'
-
-original_path = joinp(data_path,'original_solexa_data')
-annot_path = joinp(data_path,'annotation_data')
-remapped_path = joinp(data_path,'remapped_solexa_data')
-
-dna_flat_fn = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-
-gff_fn = joinp(annot_path,'TAIR7_GFF3_genes_Chr%s.gff_v1')
-
-###############################################################################
-#
-# SANITY CHECKS
-#
-###############################################################################
-#assert numQualPlifs >= 0
-#assert numDonSuppPoints > 1
-#assert numAccSuppPoints > 1
-#assert numLengthSuppPoints > 1
-#assert numQualSuppPoints > 1
-assert os.path.exists(dna_flat_fn), 'DNA data does not exist!'
-
-extended_alphabet = ['-','a','c','g','t','n','[',']']
-alphabet = ['-','a','c','g','t','n']
-
-###############################################################################
-#
-# Settings for the VMatch pipeline steps
-#
-###############################################################################
-
-reads_location = ''
-
-#
-# First VMatch step
-#
-
-mismatches_1 = 2
-end_gap_1 = 0
-repeat_mapping_1 = 1
-seedlength_1 = 9
-suffixtree_1 = '/media/oka_raid/nobackup/data/Vmatch/ATH/ATH1.v5.seed9.fa'
-
-#
-# Second VMatch step
-#
-
-mismatches_2 = 1
-sub_mismatches_2 = 1
-min_short_end_2 = 14
-repeat_mapping_2 = 1
-seedlength_2 = 9
-suffixtree_2 = '/media/oka_raid/nobackup/data/Vmatch/ATH/ATH1.v5.seed9.fa'
-
-#
-#
-# do not modify anything below this line
-#
-#
-
-conf_object_path = os.path.join(result_dir,'config_object.pickle')
saveData('training',dataset,settings)
-def generatePredictionDataset(settings):
- """
- This function is responsible for the prediction dataset generation for the matches of
- the second vmatch run and the heuristic-filtered matches of the first run.
-
- An example in our dataset consists of:
-
- - information on the original read:
- * id
- * nucleotide sequence
- * quality vectors (at the moment: prb and chastity)
-
- - information on the target dna sequence:
- * chromosome
- * strand
- * fragment start
- * fragment end positions
-
- this information is stored in tuples which are stored then in a dictionary
- indexed by the reads unique id.
-
- CAUTION: The dictionary stores a list of the above defined tuples under each
- 'id'. This is because one read can be found several times on the genome.
- """
-
- #map_file = settings['map_file']
- map_file = jp(settings['approximation_dir'],'map.vm.spliced')
+def addToDataset(map_file,dataset,settings):
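+    """Parse the given map file and add its entries to the given dataset."""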
assert os.path.exists(map_file), 'Error: Can not find map file'
- dataset = {}
-
prb_offset = settings['prb_offset']
# This tuple specifies an interval for valid Illumina Genome Analyzer quality values
print 'Total examples processed: %d' % instance_counter
+ return dataset
+
+
+def generatePredictionDataset(settings):
+ """
+    This function generates the prediction dataset from the matches of the
+    second vmatch run and from the heuristic-filtered matches of the first run.
+
+ An example in our dataset consists of:
+
+ - information on the original read:
+ * id
+ * nucleotide sequence
+ * quality vectors (at the moment: prb and chastity)
+
+ - information on the target dna sequence:
+ * chromosome
+ * strand
+ * fragment start
+ * fragment end positions
+
+    This information is stored in tuples, which are then collected in a
+    dictionary indexed by the read's unique id.
+
+    CAUTION: The dictionary stores a list of the above-defined tuples under
+    each 'id', because one read can occur at several locations in the genome.
+ """
+
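+    # The resulting layout maps each read id to a list of alignment tuples,
+    # e.g. (hypothetical ids and names, for illustration only):
+    #
+    #   dataset = { '1337' : [ (read_info_1, target_info_1),
+    #                          (read_info_2, target_info_2) ] }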
+ dataset = {}
+
+ map_file = settings['spliced_reads_fn']
+ dataset = addToDataset(map_file,dataset,settings)
+
+ map_file = jp(settings['approximation_dir'],'map.vm.spliced')
+ dataset = addToDataset(map_file,dataset,settings)
+
saveData('prediction',dataset,settings)
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
import sys
import os
total_size = seqInfo.chromo_sizes[chromo]
start_pos = total_size - start_pos
- starts = [int(elem) + start_pos for elem in starts]
- ends = [int(elem) + start_pos for elem in ends]
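+    # convert the relative match coordinates into absolute genomic
+    # positions; forward-strand matches need an additional -2 correction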
+ if strand == '+':
+       starts = [int(elem) + start_pos - 2 for elem in starts]
+       ends   = [int(elem) + start_pos - 2 for elem in ends]
+ else:
+ starts = [int(elem) + start_pos for elem in starts]
+ ends = [int(elem) + start_pos for elem in ends]
if strand == '-':
starts = [e-window_size for e in starts]
def createBlatOutput(current_prediction,settings):
"""
This function takes one prediction as input and returns the corresponding
- alignment in BLAT format.
+ alignment in BLAT format:
+
+ 1. matches - Number of bases that match that aren't repeats
+ 2. misMatches - Number of bases that don't match
+ 3. repMatches - Number of bases that match but are part of repeats
+ 4. nCount - Number of 'N' bases
+ 5. qNumInsert - Number of inserts in query
+ 6. qBaseInsert - Number of bases inserted in query
+ 7. tNumInsert - Number of inserts in target
+ 8. tBaseInsert - Number of bases inserted in target
+    9. strand - '+' or '-' for query strand. For translated alignments, second '+' or '-' is for genomic strand
+ 10. qName - Query sequence name
+ 11. qSize - Query sequence size
+ 12. qStart - Alignment start position in query
+ 13. qEnd - Alignment end position in query
+ 14. tName - Target sequence name
+ 15. tSize - Target sequence size
+ 16. tStart - Alignment start position in target
+ 17. tEnd - Alignment end position in target
+ 18. blockCount - Number of blocks in the alignment
+ 19. blockSizes - Comma-separated list of sizes of each block
+ 20. qStarts - Comma-separated list of starting positions of each block in query
+ 21. tStarts - Comma-separated list of starting positions of each block in target
+
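+    An example line (hypothetical values: a 38 bp read matching chromosome 1
+    in two blocks separated by a 120 bp gap on the target) could look like:
+
+      38 0 0 0 0 0 1 120 + read_1337 38 0 38 chr1 30432563 10001 10159 2 20,18, 0,20, 10001,10141,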
"""
# fetch the data needed
self.mmatrix = mmatrix
self.qualityPlifs = qualityPlifs
- self.read_size = 38
- self.prb_offset = 64
- #self.prb_offset = 50
+ self.prb_offset = settings['prb_offset']
# parameters of the heuristics to decide whether the read is spliced
#self.splice_thresh = 0.005
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding:latin-1 -*-
-
-
-class Run(dict):
- """
- This class represents the needed parameters for one single run of an
- algorithm. This means that if we for example perform a model selection over
- C then for each value of C we have to create one Run object storing the
- respective parameters.
- """
-
- def __init__(self):
- pass
-
- def __repr__(self):
-
- result = ""
- for key,val in self.iteritems():
- result += "%s:\t\t %s\n" % (key,str(val))
-
- return result
-
-if __name__ == '__main__':
- r1 = Run()
- r1['attrib_1'] = 12
- r1['attrib_2'] = 22
-
- print "%s" % r1
-
-#!/usr/bin/env python
-# -*- coding:latin-1 -*-
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
import sys
import os
-#!/usr/bin/env python
-# -*- coding:latin-1 -*-
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
import sys
import os
-#!/usr/bin/env python
-# -*- coding:latin-1 -*-
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
import sys
import os
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
import numpy
from numpy.matlib import mat,zeros,ones,inf
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
from numpy import inf
from numpy.matlib import zeros
for idx in range(0,num_splits):
data_fname = jp(result_dir,'map.part_%d'%idx)
- result_fname = jp(result_dir,'map.vm.part_%d.heuristic.spliced'%idx)
+ result_fname = jp(result_dir,'map.vm.part_%d'%idx)
self.result_files.append(result_fname)
current_job = KybJob(gridtools.ApproximationTaskStarter,[run_fname,data_fname,param_fname,result_fname,self.settings])
def collectResults(self):
result_dir = self.settings['approximation_dir']
combined_fn = jp(result_dir,'map.vm.spliced')
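+    # each per-part result file gets a '.spliced' suffix from the filtering
+    # jobs, so extend the collected file names before combining them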
+ self.result_files = map(lambda x:x+'.spliced',self.result_files)
combine_files(self.result_files,combined_fn)
combine_files([combined_fn,self.settings['spliced_reads_fn']],'map.vm')
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import pdb
-import os
-import sys
-import mmap
-import resource
-
-class ReadParser:
- """
- A base class for the Solexa reads parsers.
- """
-
- def __init__(self,filename):
- self.fh = open(filename)
-
- def __iter__(self):
- return self
-
- def next(self):
- pass
-
- def parse(self):
- lines = []
-
- for elem in self:
- lines.append(elem)
-
- return lines
-
-
-def parse(filename):
- entries = []
-
- fh = open(filename)
-
- all_lines = fh.read()
- all_lines = all_lines.split('\n')
-
- for parsed_line in all_lines:
- if parsed_line == '':
- continue
-
- id,chr,pos,strand,mismatches,length,offset,seq,prb,cal_prb,chastity =\
- parsed_line.split()
- chr = int(chr)
- pos = int(pos)
-
- mismatches = int(mismatches)
- length = int(length)
- offset = int(offset)
-
- line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand,\
- 'mismatches':mismatches, 'length':length, 'offset':offset,\
- 'seq':seq,'prb':prb,'cal_prb':cal_prb,'chastity':chastity}
-
- entries.append((line_d,parsed_line))
-
- return entries
-
-
- def parse(self):
- entries = {}
-
- for elem in self.fh:
- line_d = self.parseLine(elem)
- id = line_d['id']
- assert id not in entries, pdb.set_trace()
- entries[id] = line_d
-
- return entries
-
-class RemappedReadParser(ReadParser):
- """
- This class offers a parser for the reads that are remapped by the vmatch
- utility.
-
- According to the docu the entries are:
-
- ID, Chromosome, Position, Orientation (D or P), Mismatches, Alignment length, Offset, Alignment
-
- """
-
- def __init__(self,filename):
- ReadParser.__init__(self,filename)
-
- def parseLine(self,line):
- """
- We assume that a line has the following entires:
-
- """
- id,chr1,pos1,seq1,chr2,pos2,seq2,quality = line.split()
-
- chr1 = int(chr1)
- chr2 = int(chr2)
-
- seq1=seq1.lower()
- seq2=seq2.lower()
-
- pos1 = int(pos1)
- pos2 = int(pos2)
-
- line_d = {'id':id, 'chr1':chr1, 'pos1':pos1, 'seq1':seq1, 'chr2':chr2,\
- 'pos2':pos2, 'seq2':seq2, 'quality':'quality'}
-
- return line_d
-
- def next(self):
- for line in self.fh:
- line = line.strip()
- yield self.parseLine(line)
-
- raise StopIteration
-
- def parse(self):
- entries = {}
-
- for elem in self.fh:
- line_d = self.parseLine(elem)
- id = line_d['id']
- try:
- entries[id] = [line_d]
- except:
- old_entry = entries[id]
- old_entry.append(line_d)
- entries[id] = old_entry
-
- return entries
-
-
-class MapParser(ReadParser):
- """
- This class offers a parser for the reads that are remapped by the vmatch
- utility.
-
- According to the docu the entries are:
-
- ID, Chromosome, Position, Orientation (D or P), Mismatches, Alignment length, Offset, Alignment
-
- """
-
- def __init__(self,filename):
- ReadParser.__init__(self,filename)
-
- def parseLine(self,line):
- """
- We assume that a line has the following entries:
-
- """
-
- id,chr,pos,strand,mismatches,length,offset,seq = line.split()
-
- chr = int(chr)
- pos = int(pos)
-
- if strand == 'D':
- strand = '+'
-
- if strand == 'P':
- strand = '-'
-
- mismatches = int(mismatches)
- length = int(length)
- offset = int(offset)
-
- seq=seq.lower()
-
- line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand,\
- 'mismatches':mismatches, 'length':length, 'offset':offset,\
- 'seq':seq}
-
- return line_d
-
- def next(self):
- for line in self.fh:
- line = line.strip()
- yield self.parseLine(line)
-
- raise StopIteration
-
- def parse(self):
- entries = {}
-
- for elem in self.fh:
- line_d = self.parseLine(elem)
- id = line_d['id']
- try:
- entries[id] = [line_d]
- except:
- old_entry = entries[id]
- old_entry.append(line_d)
- entries[id] = old_entry
-
- return entries
-
-
-
-def parse_map_vm_heuristic(filename):
- """
- This function parse the map.vm file for the PipelineHeuristics
- """
- entries = []
-
- fh = open(filename)
-
- all_lines = fh.read()
- all_lines = all_lines.split('\n')
-
- for parsed_line in all_lines:
- if parsed_line == '':
- continue
-
- id,chr,pos,strand,mismatches,length,offset,seq,prb,cal_prb,chastity =\
- parsed_line.split()
- chr = int(chr)
- pos = int(pos)
-
- mismatches = int(mismatches)
- length = int(length)
- offset = int(offset)
-
- line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand,\
- 'mismatches':mismatches, 'length':length, 'offset':offset,\
- 'seq':seq,'prb':prb,'cal_prb':cal_prb,'chastity':chastity}
-
- entries.append((line_d,parsed_line))
-
- return entries
-
-def cpu():
- return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+\
- resource.getrusage(resource.RUSAGE_SELF).ru_stime)
-
-class Line:
- pass
-
-def parse_filtered_reads(filename):
- """
-
- """
- entries = {}
-
- #fh = open(filename)
- #all_lines = fh.read()
- #all_lines = all_lines.split('\n')
-
- #for parsed_line in all_lines:
- # if parsed_line == '':
- # continue
-
- print 'asking for map'
- data = map_file(filename)
- print 'obtained map'
-
- strand_map = ['-','+']
-
- full_line_split_time = 0
- full_array_time = 0
- full_dict_time = 0
-
- parsing_start = cpu()
-
- print 'start parsing'
-
- while True:
-
- start = cpu()
- parsed_line = data.readline().strip()
-
- if parsed_line == '':
- break
-
- id,chr,strand,seq,splitpos,read_size,_prb,_cal_prb,_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = parsed_line.split()
- stop = cpu()
-
- full_line_split_time += stop-start
-
- id = int(id)
- splitpos = int(splitpos)
- read_size = int(read_size)
-
- seq=seq.lower()
-
- #assert strand in ['D','P']
-
- strand = strand_map[strand == 'D']
-
- chr = int(chr)
-
- start_array_time = cpu()
-
- prb = [0]*read_size
- cal_prb = [0]*read_size
- chastity = [0]*read_size
-
- for idx in range(read_size):
- prb[idx] = ord(_prb[idx])-50
- cal_prb[idx] = ord(_cal_prb[idx])-64
- chastity[idx] = ord(_chastity[idx])+10
-
- #import array
-
- #prb = array.array('b',_prb)
- #cal_prb = array.array('b',_cal_prb)
- #chastity = array.array('b',_chastity)
-
- #for idx in range(read_size):
- # prb[idx] -= 50
- # cal_prb[idx] -= 64
- # chastity[idx] += 10
-
- #assert prb == __prb
- #assert cal_prb == __cal_prb
- #assert chastity == __chastity
-
- stop_array_time = cpu()
-
- full_array_time += stop_array_time - start_array_time
-
-
- start_dict_time = cpu()
-
- p_start = int(p_start)
- exon_stop = int(exon_stop)
- exon_start = int(exon_start)
- p_stop = int(p_stop)
- true_cut = int(true_cut)
-
- line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
- 'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
- 'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
- 'p_stop':p_stop,'true_cut':true_cut}
-
- stop_dict_time = cpu()
-
- full_dict_time += stop_dict_time - start_dict_time
-
- entries[id] = line_d
-
- parsing_stop = cpu()
-
- print 'parsing took %f secs' % (stop-start)
- print 'line split time %f ' % full_line_split_time
- print 'array time %f ' % full_array_time
- print 'dict time %f ' % full_dict_time
-
- return entries
-
-
-def parse_map_vm(filename):
- """
- Parses the map.vm files from Vmatch
-
-
- For an original read we can have several vmatch locations
- """
- entries = {}
-
- #fh = open(filename)
- #all_lines = fh.read()
- #all_lines = all_lines.split('\n')
-
- #for parsed_line in all_lines:
- # if parsed_line == '':
- # continue
-
- data = map_file(filename)
-
- while True:
-
- parsed_line = data.readline()
-
- if parsed_line == '':
- break
-
- id,chr,pos,strand,mismatches,length,offset,seq,q1,q2,q3 = parsed_line.split()
-
- id = int(id)
-
- chr = int(chr)
- pos = int(pos)
-
- if strand == 'D':
- strand = '+'
-
- if strand == 'P':
- strand = '-'
-
- mismatches = int(mismatches)
- length = int(length)
- offset = int(offset)
-
- seq=seq.lower()
-
- line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand,\
- 'mismatches':mismatches, 'length':length, 'offset':offset}
-
- if entries.has_key(id):
- #pdb.set_trace()
- old_entry = entries[id]
- old_entry.append(line_d)
- entries[id] = old_entry
- else:
- entries[id] = [line_d]
-
- return entries
-
-
-def map_file(filename):
- """
- As the mmapped file object behaves just like plain file objects
- you do not have to unmap but just close the file.
- """
-
- file = open(filename, "r+")
- size = os.path.getsize(filename)
-
- data = mmap.mmap(file.fileno(), size, mmap.ACCESS_READ)
- assert size == len(data), 'Could not map whole file at once!'
-
- return data
from numpy.matlib import inf
-from Genefinding import doIntervalQuery
+from Genefinding import doIntervalQuery,new_intp,intp_assign
+
+from QPalmaDP import createIntArrayFromList
def load_genomic(chromosome, strand, start, stop, genome, one_based=True):
# Before creating a candidate spliced read dataset we have to first filter
# the matches from the first seed finding run.
- approx_task = ApproximationTask(self.settings)
- approx_task.CreateJobs()
- approx_task.Submit()
- approx_task.CheckIfTaskFinished()
+ #approx_task = ApproximationTask(self.settings)
+ #approx_task.CreateJobs()
+ #approx_task.Submit()
+ #approx_task.CheckIfTaskFinished()
# After filtering combine the filtered matches from the first run and the
# found matches from the second run to a full dataset
+++ /dev/null
-acgtacgtagcgatctgctcgtacgtgctcgatcaggcagtczzuuctgt
import pdb
import unittest
-from qpalma_main import QPalma
+from qpalma.qpalma_main import QPalma
from qpalma.utils import print_prediction
-from createAlignmentFileFromPrediction import alignment_reconstruct
+from qpalma.Run import Run
+
+from qpalma.OutputFormat import alignment_reconstruct
from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo
def testTableThalianaData(self):
- g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+    """
+    Check sequence retrieval and splice-site scores against the
+    A. thaliana genome data.
+    """
+
+ g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+ g_fmt = 'chr%d.dna.flat'
+ s_fmt = 'contig_%d%s'
- g_fmt = 'chr%d.dna.flat'
- s_fmt = 'contig_%d%s'
+ settings = {}
+ settings['genome_dir'] = g_dir
+ settings['acceptor_scores_loc'] = acc_dir
+ settings['donor_scores_loc'] = don_dir
- num_chromo = 2
+ settings['genome_file_fmt'] = g_fmt
+    settings['splice_score_file_fmt'] = s_fmt
- lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,num_chromo))
+ settings['allowed_fragments'] = [1]
- accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
- seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+ lt1 = LookupTable(settings)
+
+ accessWrapper = DataAccessWrapper(settings)
+ seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
seq = 'CTGGCCAAAAGCTCAGGGAAGACGCAGCCTAGGGCTCC'.lower()
seq = reverse_complement(seq)
strand = '-'
pos = 10475515
dna = seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
+
self.assertEqual(seq,dna)
- dna = lt1.get_seq_and_scores(chromo,strand,pos,pos+38,True)
- pdb.set_trace()
+ full_info = lt1.get_seq_and_scores(chromo,strand,pos,pos+38)
+ dna = full_info[0]
+
self.assertEqual(seq,dna)
dna,acc,don = seqInfo.get_seq_and_scores(1,'+',1,1369)