+ added read structure for new parser
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Sat, 2 Feb 2008 14:03:56 +0000 (14:03 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Sat, 2 Feb 2008 14:03:56 +0000 (14:03 +0000)
+ added missing strand check
+ changin map parser to read remapped reads

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7667 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/Configuration.py
qpalma/DataProc.py
qpalma/tools/parseGff.py
tools/data_tools/datastructures.h
tools/data_tools/filterReads.c
tools/data_tools/read.h [new file with mode: 0644]

index d35c6c4..3d06aa0 100644 (file)
@@ -422,9 +422,21 @@ training_end      = 1500
 prediction_begin  = 1500
 prediction_end    = 2200
 
-dna_filename         = '/fml/ag-raetsch/share/projects/qpalma/solexa/allGenes.pickle'
-est_filename         = '/fml/ag-raetsch/share/projects/qpalma/solexa/filteredReads_recent'
-tair7_seq_filename   = '/fml/ag-raetsch/share/projects/qpalma/solexa/tair7_seq.pickle'
+joinp = os.path.join
+
+data_path      = '/fml/ag-raetsch/share/projects/qpalma/solexa'  
+
+dna_filename   = joinp(data_path,'allGenes.pickle')
+est_filename   = joinp(data_path,'remapped_solexa_data/map_best_hit.18.unambig')
+
+remapped_path  = joinp(data_path,'remapped_solexa_data')
+annot_path     = joinp(data_path,'annotation_data')
+original_path  = joinp(data_path,'original_solexa_data')
+
+import tools
+from tools.parseGff import createGffPickle
+
+createGffPickle(joinp(annot_path,'TAIR7_GFF3_genes_Chr1.gff_v1'),dna_filename)
 
 ###############################################################################
 #
@@ -439,5 +451,5 @@ assert numQualSuppPoints   > 1
 
 assert os.path.exists(dna_filename), 'DNA data does not exist!'
 assert os.path.exists(est_filename), 'EST/Reads data does not exist!'
-assert os.path.exists(tair7_seq_filename), 'Sequence data does not exist!'
+#assert os.path.exists(tair7_seq_filename), 'Sequence data does not exist!'
 
index f123dd7..71b7573 100644 (file)
@@ -5,7 +5,8 @@ from numpy.matlib import mat,zeros,ones,inf
 import cPickle
 import pdb
 import Configuration as Conf
-from PyGff import *
+import tools
+from tools.PyGff import *
 import io_pickle
 import scipy.io
 import sys
@@ -19,8 +20,7 @@ def paths_load_data_solexa(expt,genome_info,PAR,test=False):
    dna_filename = Conf.dna_filename
    est_filename = Conf.est_filename
 
-   #tair7_seq_filename = Conf.tair7_seq_filename 
-   #tair7_seq = cPickle.load(open(tair7_seq_filename))
+   pdb.set_trace()
 
    allGenes  = cPickle.load(open(dna_filename))
 
@@ -35,7 +35,10 @@ def paths_load_data_solexa(expt,genome_info,PAR,test=False):
    # Iterate over all reads
    for line in open(est_filename):
       line = line.strip()
-      chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
+      #chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
+      #chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
+      id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split()
+
       splitpos = int(splitpos)
       length = int(length)
       prb = [ord(elem)-50 for elem in prb]
index b01bbdc..8baabc6 100644 (file)
@@ -34,7 +34,7 @@ def parse(gff_fid):
             allGenes[currentGene.id] = currentGene
          
          desc = rx.search(desc).group()[3:-1]
-         print desc
+         #print desc
          currentGene = Gene(chr,start,stop,strand,desc)
 
       elif id == 'five_prime_UTR':
@@ -88,6 +88,15 @@ def parse(gff_fid):
       allGenes[currentGene.id] = currentGene
 
    return allGenes
+
+
+def createGffPickle(annotFile,pickleFile):
+   gff_fid = open(annotFile)
+   pickle_fid = open(pickleFile,'w+')
+   allGenes = parse(gff_fid)
+   #for key,val in allGenes.iteritems():
+      #print key
+   cPickle.dump(allGenes,pickle_fid)
       
 if __name__ == '__main__':
    assert len(sys.argv) >= 3
@@ -96,9 +105,4 @@ if __name__ == '__main__':
    assert os.path.exists(annotFile)
    assert not os.path.exists(pickleFile)
 
-   gff_fid = open(annotFile)
-   pickle_fid = open(pickleFile,'w+')
-   allGenes = parse(gff_fid)
-   for key,val in allGenes.iteritems():
-      print key
-   cPickle.dump(allGenes,pickle_fid)
+   createGffPickle(annotFile,pickleFile)
index 4b295f3..fa04070 100644 (file)
@@ -12,17 +12,6 @@ struct gene {
    char* id;
 };
 
-struct read {
-   int start;
-   int stop;
-   char* seq;
-   char strand;
-
-   int* prb;
-   int* cprb;
-   int* chastity;
-};
-
 struct gene* gene_alloc(void);
 void free_gene(struct gene* oldGene);
 void add_exon(struct gene* currentGene,int start, int stop);
index 284c8f8..55730cf 100644 (file)
@@ -416,6 +416,9 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
 
          remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
 
+         if(up_strand != down_strand)
+            continue;
+
          new_seq[0] = '\0';
          new_prb[0] = '\0';
          new_cal_prb[0] = '\0';
diff --git a/tools/data_tools/read.h b/tools/data_tools/read.h
new file mode 100644 (file)
index 0000000..d2c55b6
--- /dev/null
@@ -0,0 +1,59 @@
+#ifndef __READ_H__
+#define __READ_H__
+
+typedef struct read {
+   int chr;
+   int pos;
+   unsigned long id;
+   char strand;
+   int mismatch;
+   int occurrence;
+   int size;
+   int cut;
+
+   char* seq;
+   char* prb;
+   char* cal_prb;
+   char* chastity;
+} Read;
+
+void init_read(struct read *r, int size) {
+   r->chr         = 0;
+   r->pos         = 0;
+   r->id          = 0;
+   r-> strand     = ' ';
+   r->mismatch    = 0;
+   r->occurrence  = 0;
+   r->size        = size;
+   r->cut         = 0;
+
+   r->seq      = malloc(sizeof(char)*(r->size));
+   r->prb      = malloc(sizeof(char)*(r->size));
+   r->cal_prb  = malloc(sizeof(char)*(r->size));
+   r->chastity = malloc(sizeof(char)*(r->size));
+}
+
+void create_read(Read* newRead, int chr,int pos, char* seq, unsigned long id, char strand, int mismatch, int occurrence, int size, int cut, char* prb, char* cal_prb, char* chastity) {
+   newRead->chr         = chr;
+   newRead->pos         = pos;
+   newRead->id          = id;
+   newRead-> strand     = strand;
+   newRead->mismatch    = mismatch;
+   newRead->occurrence  = occurrence;
+   newRead->size        = size;
+   newRead->cut         = cut;
+
+   newRead->seq      = malloc(sizeof(char)*(size));
+   newRead->prb      = malloc(sizeof(char)*(size));
+   newRead->cal_prb  = malloc(sizeof(char)*(size));
+   newRead->chastity = malloc(sizeof(char)*(size));
+
+   strncpy(newRead->seq,seq,size);
+   strncpy(newRead->prb,prb,size);
+   strncpy(newRead->cal_prb,cal_prb,size);
+   strncpy(newRead->chastity,chastity,size);
+}
+
+
+
+#endif // __READ_H__