+ added script to generate datasets
author  fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Sat, 2 Feb 2008 15:06:08 +0000 (15:06 +0000)
committer  fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Sat, 2 Feb 2008 15:06:08 +0000 (15:06 +0000)
+ added parsers for the solexa reads

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7668 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/Configuration.py
qpalma/DataProc.py
qpalma/parsers.py [new file with mode: 0644]
qpalma/tools/parseGff.py
scripts/compile_dataset.py [new file with mode: 0644]

index 3d06aa0..e613511 100644 (file)
@@ -424,6 +424,8 @@ prediction_end    = 2200
 
 joinp = os.path.join
 
+tmp_dir = '/fml/ag-raetsch/home/fabio/tmp/solexa_tmp'
+
 data_path      = '/fml/ag-raetsch/share/projects/qpalma/solexa'  
 
 dna_filename   = joinp(data_path,'allGenes.pickle')
@@ -433,11 +435,7 @@ remapped_path  = joinp(data_path,'remapped_solexa_data')
 annot_path     = joinp(data_path,'annotation_data')
 original_path  = joinp(data_path,'original_solexa_data')
 
-import tools
-from tools.parseGff import createGffPickle
-
-createGffPickle(joinp(annot_path,'TAIR7_GFF3_genes_Chr1.gff_v1'),dna_filename)
-
+joinp(annot_path,'TAIR7_GFF3_genes_Chr1.gff_v1')
 ###############################################################################
 #
 # SANITY CHECKS
index 71b7573..5d02c6b 100644 (file)
@@ -14,6 +14,9 @@ import sys
 from genome_utils import load_genomic
 
 def paths_load_data_solexa(expt,genome_info,PAR,test=False):
+   """
+
+   """
    # expt can be 'training','validation' or 'test'
    assert expt in ['training','validation','test']
 
@@ -32,6 +35,8 @@ def paths_load_data_solexa(expt,genome_info,PAR,test=False):
    Qualities = []
    SplitPositions = []
 
+   rrp = RemappedReadParser('est_filename')
+
    # Iterate over all reads
    for line in open(est_filename):
       line = line.strip()
@@ -209,10 +214,6 @@ def paths_load_data_solexa(expt,genome_info,PAR,test=False):
 
 
 def paths_load_data_pickle(expt,genome_info,PAR):
-   """
-      
-   """
-
    # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR)
    # Load the relevant file and return the alignment data
 
@@ -310,64 +311,6 @@ def paths_load_data(expt,genome_info,PAR):
       Exons     = data['TrainExon']   # exon boundaries
       Ests      = data['TrainEsts']   # est sequences
      
-   #elif expt == 'validation':
-   #  print('Loading validation data\n') ;
-   #  if PAR.microexon,
-   #    if PAR.LOCAL_ALIGN
-   #      %local version
-   #      load(sprintf('%s/microexon_val_data_cut_local_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
-   #         genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob),  ...
-   #      'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
-   #    else
-   #      %global version
-   #      load(sprintf('%s/microexon_val_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
-   #         genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob),  ...
-   #      'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
-   #    end
-   #  else
-   #    load(sprintf('%s/exons_val_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
-   #       genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob),  ...
-   #    'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
-   #  end
-   #  
-   #  Sequences = Val ;      % dna sequences
-   #  Acceptors = ValAcc ;   % acceptor scores
-   #  Donors    = ValDon ;   % donor scores
-   #  Exons     = ValExon ;  % exon boundaries
-   #  Ests      = ValEsts ;  % est sequences
-
-
-
-   #elif expt == 'test':
-   #  fprintf('Loading test data\n') ;
-   #  if PAR.microexon,
-   #    if PAR.LOCAL_ALIGN
-   #      %local version
-   #      load(sprintf('%s/microexon_test_data_cut_local_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
-   #         genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
-   #      'TestEsts', 'TestExon', 'Test', 'TestAcc', 'TestDon') ;
-   #    else
-   #      %global version
-   #      load(sprintf('%s/microexon_test_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
-   #         genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob),  ...
-   #      'TestEsts', 'TestExon', 'Test','TestAcc', 'TestDon', 'TestNoise') ;
-   #      Noises    = TestNoise ; % substitution matrix
-   #      end
-   #      else
-   #    load(sprintf('%s/exons_test_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
-   #       genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
-   #    'TestEsts', 'TestExon', 'Test', 'TestAcc', 'TestDon') ;
-   #  end
-   #  
-   #  Sequences = Test ;      % dna sequences
-   #  Acceptors = TestAcc ;   % acceptor scores
-   #  Donors    = TestDon ;   % donor scores
-   #  Exons     = TestExon ;  % exon boundaries
-   #  Ests      = TestEsts ;  % est sequences
-
-   # Lower all indices by one to convert matlab 
-   # to python indices
-
    Exons -= 1
 
    return Sequences, Acceptors, Donors, Exons, Ests, Noises
diff --git a/qpalma/parsers.py b/qpalma/parsers.py
new file mode 100644 (file)
index 0000000..072debd
--- /dev/null
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+class ReadParser:
+   """
+   A base class for the Solexa reads parsers.
+
+   Subclasses are iterable: each call to next() returns the parsed fields
+   of one line of the underlying file.
+   """
+
+   def __init__(self,filename):
+      # the file handle stays open for the lifetime of the parser
+      self.fh = open(filename)
+
+   def __iter__(self):
+      return self
+
+   def next(self):
+      # the original body was 'pass', which made iterating the base class
+      # yield None forever; force subclasses to implement the parsing
+      raise NotImplementedError
+
+
+class FilteredReadParser(ReadParser):
+   """
+   This class offers a parser for the reads that are created by the first
+   filtering step performed to cut and join new reads.
+   """
+
+   def __init__(self,filename):
+      # the original call missed the explicit 'self' argument required
+      # when invoking the (old-style) base class method unbound
+      ReadParser.__init__(self,filename)
+
+   def parseLine(self,line):
+      """
+      We assume that a line has the following entries:
+
+      id, chr, pos, strand, mismatches, align_len, offset, alignment
+      """
+      # the original 'return a,b,... = line.split()' was a syntax error;
+      # unpack first, then return the tuple
+      id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split()
+      return id,chr,pos,strand,mismatches,align_len,offset,alignment
+
+   def next(self):
+      # next() must return one record per call; the original used 'yield'
+      # here, so every call produced a fresh, never-consumed generator
+      line = self.fh.readline()
+      if line == '':
+         raise StopIteration
+      return self.parseLine(line.strip())
+
+
+class RemappedReadParser(ReadParser):
+   """
+   This class offers a parser for the reads that are remapped by the vmatch
+   utility. 
+   """
+
+   def __init__(self,filename):
+      # the original call missed the explicit 'self' argument required
+      # when invoking the (old-style) base class method unbound
+      ReadParser.__init__(self,filename)
+
+   def parseLine(self,line):
+      """
+      We assume that a line has the following entries:
+
+      id, chr, pos, strand, mismatches, align_len, offset, alignment
+      """
+      # the original 'return a,b,... = line.split()' was a syntax error;
+      # unpack first, then return the tuple
+      id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split()
+      return id,chr,pos,strand,mismatches,align_len,offset,alignment
+
+   def next(self):
+      # next() must return one record per call; the original used 'yield'
+      # here, so every call produced a fresh, never-consumed generator
+      line = self.fh.readline()
+      if line == '':
+         raise StopIteration
+      return self.parseLine(line.strip())
index 8baabc6..daeffe9 100644 (file)
@@ -9,7 +9,7 @@ from PyGff import *
 import cPickle
 import copy
 
-def parse(gff_fid):
+def parse_gff(gff_fid):
    reader = csv.reader(gff_fid, delimiter='\t', quoting=csv.QUOTE_NONE)
 
    allGenes = {}
@@ -93,7 +93,7 @@ def parse(gff_fid):
 def createGffPickle(annotFile,pickleFile):
    gff_fid = open(annotFile)
    pickle_fid = open(pickleFile,'w+')
-   allGenes = parse(gff_fid)
+   allGenes = parse_gff(gff_fid)
    #for key,val in allGenes.iteritems():
       #print key
    cPickle.dump(allGenes,pickle_fid)
diff --git a/scripts/compile_dataset.py b/scripts/compile_dataset.py
new file mode 100644 (file)
index 0000000..88596e8
--- /dev/null
@@ -0,0 +1,272 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import pdb
+import io_pickle
+
+import qpalma.tools
+from qpalma.tools.parseGff import parse_gff
+
+from qpalma.parsers import FilteredReadParser, RemappedReadParser
+
+help = """
+
+   Usage of this script is:
+
+   ./compile_dataset.py gff_file  dna_flat_files solexa_reads remapped_reads dataset_file
+
+   """
+
+info = """
+   The purpose of this script is to compile a full data set for QPalma out of the
+   following files: 
+
+   - The information of the gff (gene annotation).
+
+   - The flat dna files with the sequence of the respective gene positions
+
+   - The solexa reads in the filtered version 
+
+   - The solexa reads in the remapped version 
+
+   This represents a dataset for use with QPalma
+   The entries are def'd as follows:
+
+   - 'id' a unique id for the read
+   - 'chr' the chromosome identifier
+   - 'strand' the identifier for the strand either '+' or '-'
+
+   - 'read_start' 
+   - 'exon_stop'
+   - 'exon_start'
+   - 'read_stop'
+
+   - 'remapped_pos' - the position of the remapped read
+   """
+
+def compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,tmp_dir,dataset_file):
+   """
+   Compile a full QPalma dataset out of the gff annotation, the dna flat
+   files and the filtered/remapped Solexa reads.
+
+   NOTE(review): as committed this function cannot run -- it uses names
+   defined nowhere in this file (os, Conf, test, zeros, inf, load_genomic,
+   Genef, createIntArrayFromList, createDoubleArrayFromList) and per-read
+   fields (seq, splitpos, length, prb, cal, chastity, gene_id, p_start,
+   exon_stop, exon_start, p_stop) that are never unpacked from the parsed
+   line.  Confirm the missing imports and the intended line format.
+   """
+
+   # sanity-check all input paths before doing any work
+   assert os.path.exists(gff_file)
+   for file in dna_flat_files:
+      assert os.path.exists(file)
+
+   assert os.path.exists(solexa_reads)
+   assert os.path.exists(remapped_reads)
+
+   # refuse to overwrite an already compiled dataset
+   assert not os.path.exists(dataset_file), 'The data_file already exists!'
+
+   joinp = os.path.join
+
+   # first read the gff file(s) and create pickled gene information
+   allGenes = parse_gff(gff_file,joinp(tmp_dir,'gff_info.pickle'))
+   
+   dna_filename = Conf.dna_filename
+   est_filename = Conf.est_filename
+
+   Sequences = []
+   Acceptors = []
+   Donors = []
+   Exons = []
+   Ests = []
+   Qualities = []
+   SplitPositions = []
+
+   # NOTE(review): both parsers are created but never used below (the loop
+   # re-parses the file by hand) and 'est_filename' is passed as a literal
+   # string instead of the variable -- presumably leftovers; verify.
+   frp = FilteredReadParser('')
+   rrp = RemappedReadParser('est_filename')
+
+
+   # Iterate over all reads
+   for line in open(est_filename):
+      line = line.strip()
+      #chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
+      #chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
+      id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split()
+
+      # decode quality strings into integer scores; the ord() offsets look
+      # like ASCII re-basings -- TODO confirm against the read file format
+      splitpos = int(splitpos)
+      length = int(length)
+      prb = [ord(elem)-50 for elem in prb]
+      cal = [ord(elem)-64 for elem in cal]
+      chastity = [ord(elem)+10 for elem in chastity]
+
+      assert len(prb) == len(seq)
+
+      currentGene = allGenes[gene_id]
+      seq = seq.lower()
+
+      #try:
+      #   currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
+      #except:
+      #   continue
+
+      genome_file = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+      chrom    = 'chr1'
+
+      # use matlab-style functions to access dna sequence
+      dna = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True)
+      currentSeq = dna
+
+      if len(currentSeq) < (currentGene.stop - currentGene.start):
+         continue
+
+      # compare both sequences
+      #assert dna == currentSeq, pdb.set_trace()
+   
+      p_start = int(p_start)
+      exon_stop = int(exon_stop)
+      exon_start = int(exon_start)
+      p_stop = int(p_stop)
+
+      assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
+      assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
+      assert p_stop - p_start >= 36
+   
+      # the read spans two exons: [p_start,exon_stop] and [exon_start,p_stop]
+      currentExons = zeros((2,2))
+
+      currentExons[0,0] = p_start
+      currentExons[0,1] = exon_stop
+
+      currentExons[1,0] = exon_start
+      currentExons[1,1] = p_stop
+
+      # make indices relative to start pos of the current gene
+      currentExons -= currentGene.start
+
+      # determine cutting positions for sequences
+      up_cut   = int(currentExons[0,0])
+      down_cut = int(currentExons[1,1])
+
+      # if we perform testing we cut a much wider region as we want to detect
+      # how good we perform on a region
+      # NOTE(review): 'test' is not defined anywhere in this function
+      if test:
+         up_cut = up_cut-500
+         if up_cut < 0:
+            up_cut = 0
+
+         down_cut = down_cut+500
+
+         if down_cut > len(currentSeq):
+            down_cut = len(currentSeq)
+
+      # cut out part of the sequence
+      currentSeq = currentSeq[up_cut:down_cut]
+
+      # ensure that index is correct
+      currentExons[0,1] += 1
+
+      if test:
+         currentExons -= up_cut
+      else:
+         currentExons -= currentExons[0,0]
+
+      # keep only reads whose intron boundaries show 'g' followed by 'c'/'t'
+      # on the donor side and 'a','g' on the acceptor side; skip the rest
+      try:
+         if not (currentSeq[int(currentExons[0,1])] == 'g' and\
+         currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
+            continue
+
+         if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
+            continue
+
+      except:
+         # NOTE(review): bare except dropping into the debugger -- debug aid
+         pdb.set_trace()
+
+      Sequences.append(currentSeq)
+
+      # now we want to use interval_query to get the predicted splice scores
+      # trained on the TAIR sequence and annotation
+
+      from Genefinding import *
+      
+      interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
+      num_fields = 1
+      num_intervals = 1
+
+      # fetch acceptor scores
+      fname =  '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_1+' 
+      gf = Genef()
+      num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
+      assert num_hits <= len(currentSeq)
+      
+      #print 'Acceptor hits: %d' % num_hits
+
+      pos = createIntArrayFromList([0]*num_hits)
+      indices = createIntArrayFromList([0]*num_hits)
+      scores = createDoubleArrayFromList([0]*num_hits*num_fields)
+      gf.getResults(pos,indices,scores)
+
+      # one score per sequence position; -inf marks "no predicted site"
+      currentAcceptors  = [-inf]*(len(currentSeq)+1)
+      
+      for i in range(num_hits):
+         position = pos[i] 
+         position -= (currentGene.start+up_cut)
+         assert 0 <= position < len(currentSeq)+1, 'position index wrong'
+         currentAcceptors[position] = scores[i]
+
+      currentAcceptors = currentAcceptors[1:] + [-inf]
+      Acceptors.append(currentAcceptors)
+
+      del gf
+
+      # fetch donor scores
+      fname =  '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+' 
+      gf = Genef()
+      num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
+      assert num_hits <= len(currentSeq)
+
+      #print 'Donor hits: %d' % num_hits
+      pos = createIntArrayFromList([0]*num_hits)
+      indices = createIntArrayFromList([0]*num_hits)
+      scores = createDoubleArrayFromList([0]*num_hits*num_fields)
+      gf.getResults(pos,indices,scores)
+
+      currentDonors     = [-inf]*(len(currentSeq))
+
+      # NOTE(review): starts at i=1, silently skipping the first donor hit
+      # -- confirm whether that is intended or an off-by-one
+      try:
+         for i in range(1,num_hits):
+            position = pos[i] 
+            position -= (currentGene.start+up_cut)
+            currentDonors[position-1] = scores[i]
+      except:
+         pdb.set_trace()
+
+      currentDonors = [-inf] + currentDonors[:-1]
+      Donors.append(currentDonors)
+
+      Exons.append(currentExons)
+      Ests.append(seq)
+      Qualities.append(prb)
+
+      SplitPositions.append(splitpos)
+
+      #if len(Sequences[-1]) == 2755:
+      #   Sequences = Sequences[:-1]
+      #   Acceptors = Acceptors[:-1]
+      #   Donors    = Donors[:-1]
+      #   Exons     = Exons[:-1]
+      #   Ests      = Ests[:-1]
+      #   Qualities = Qualities[:-1]
+      #   SplitPositions = SplitPositions[:-1]
+   
+   #print 'found %d examples' % len(Sequences)
+
+   return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions
+
+   # NOTE(review): removed an unreachable fragment that followed the
+   # return statement above (a half-written dict literal / tuple unpack
+   # and an io_pickle.save(dataset_file,dataset) call).  The half-written
+   # line was not valid Python and prevented the module from being parsed
+   # at all; the pickling step should be reintroduced *before* the return
+   # once the dataset layout is settled.
+
+
+if __name__ == '__main__':
+
+   # with no arguments just print the description of this script and stop
+   if len(sys.argv) == 1:
+      print info
+      sys.exit(0)
+
+   assert len(sys.argv) == 6, help
+
+   # bind the five command line arguments to the names compile_d expects;
+   # the original call line ended in a stray ':' (syntax error) and used
+   # variables that were never defined
+   gff_file, dna_flat_files, solexa_reads, remapped_reads, dataset_file = sys.argv[1:6]
+
+   # NOTE(review): compile_d additionally takes a tmp_dir parameter that
+   # the usage string does not mention -- None is a placeholder until the
+   # intended source (e.g. Configuration.tmp_dir) is confirmed.  The single
+   # dna flat file path is wrapped in a list because compile_d iterates it.
+   compile_d(gff_file,[dna_flat_files],solexa_reads,remapped_reads,None,dataset_file)