## predhsp-mart.info ## see post-sqldump script at ~/Desktop/dspp-work/eugenomestats/ngene-predhsp.sh ## see also ~/Desktop/dspp-work/genepreds/cdspred.sql -- predhsp.sql select distinct f.chr_name,f.source,f.biotype,f.chr_start,f.chr_end,f.score, CASE WHEN f.chr_strand < 0 THEN '-' ELSE '+' END as strand, f.chr_phase, f.Parent , group_concat(distinct hf.Parent) as gene , round(100 * max( (1 + if(hf.chr_endf.chr_start,hf.chr_start,f.chr_start) ) /(hf.chr_end - hf.chr_start + 1) ) ) as pctgene into outfile '/bio/bio-grid/dspp/blstats3/dmoj-glean-hsp.gfft' from dmoj__features__dm f, dmoj__features__dm hf where f.biotype = 'HSP' -- and f.source in ('modDM') and (hf.type_source = 'CDS_GLEAN') and hf.chr_name = f.chr_name and hf.chr_start <= f.chr_end and hf.chr_end >= f.chr_start group by f.feature_id_key order by f.chr_name,f.chr_start #................................ # edit predhsp.sql for spp table & call mysql set dspp="dana dere dgri dmel dmoj dper dpse dsec dsim dvir dwil dyak" foreach spp ($dspp) if ( -s /bio/bio-grid/dspp/blstats3/$spp-glean-hsp.gfft ) continue cat predhsp.sql | sed -e"s/dmoj/$spp/g" > ! tmphsp.sql mysql drospege_mart_caf1b < tmphsp.sql end #...... create table tmp_feathsp select f.* from dmoj__features__dm f join dmoj__region__main r using (region_id_key) where r.CDS_GLEAN_bool is not NULL and ( r.HSP_modDM_bool is not null or r.HSP_modMM_bool is not null or r.HSP_modCE_bool is not null or r.HSP_modSC_bool is not null ) and (f.biotype = 'HSP' or f.type_source = 'CDS_GLEAN') ; create index feature_id_key on tmp_feathsp(feature_id_key); create index chr_name on tmp_feathsp(chr_name); create index type_source on tmp_feathsp(type_source); create index biotype on tmp_feathsp(biotype); select distinct f.chr_name,f.source,f.biotype,f.chr_start,f.chr_end,f.score, CASE WHEN f.chr_strand < 0 THEN '-' ELSE '+' END as strand, f.chr_phase, f.Parent , group_concat(distinct hf.Parent) as gene , round(100 * max( (1 + if(hf.chr_endf.chr_start,hf.chr_start,f.chr_start) ) /(hf.chr_end - hf.chr_start + 1) ) ) as pctgene into outfile '/bio/bio-grid/dspp/blstats3/dmoj-glean-hsp.gfft' from tmp_feathsp f, tmp_feathsp hf where f.biotype = 'HSP' -- and f.source in ('modDM') and (hf.type_source = 'CDS_GLEAN') and hf.chr_name = f.chr_name and hf.chr_start <= f.chr_end and hf.chr_end >= f.chr_start group by f.feature_id_key order by f.chr_name,f.chr_start ; drop table tmp_feathsp; #......................................... // hspoverlap cds == min(hf.chr_start,f.chr_start) perl filter : 1. duplicate (same loc, hspID, geneID, pctgene) == 2+ glean exons/hsp >> join as one line, record glean exon-count xx 2. 2+ glean genes/hsp == poor glean model (usually) >> join glean gene parts x? 3. 2+ hsps/glean gene == poor hsp id 3a. same hsp-gene, 2+ _parts >> join as one hsp-part * only if ~same location? 3b. diff hsp-gene >> pick "best" hsp-gene, as main, record others? # r s t b e p o f h g a scaffold_6500 modDM HSP 30737896 30738402 259.00000 -1 0 CG9935_G1 GLEAN_00744 100 scaffold_6500 modDM HSP 30737896 30738402 259.00000 -1 0 CG9935_G1 GLEAN_00744 100 scaffold_6500 modDM HSP 30737717 30737824 72.40000 -1 0 CG9935_G1 GLEAN_00744 100 scaffold_6500 modDM HSP 30736573 30737031 261.00000 -1 0 CG9935_G1 GLEAN_00744 97 scaffold_6500 modDM HSP 30666815 30668797 535.00000 -1 0 CG5102_G1 GLEAN_00745 97 #------- ## step 1 filter................. # grep scaffold_6500 ## setenv spp "dmoj"; \ setenv spp "dere"; \ setenv mod modDM setenv mod modMM setenv mod modCE ## -- if not grep by mod, sort by mod ?? but cant mangle prior sort order except by mod # cat $spp-glean-hsp.gfft grep $mod $spp-glean-hsp.gfft | perl -ne \ 'chomp; @v= split; @v{qw(r s t b e p o f h g a)}= @v; $v{nx}=1; $ni++; \ ($v{qr} = $v{h}) =~ s/\_\w+$//; \ if(scalar(%lv)) { \ $req= ($v{r} eq $lv{r});\ $eq=1; foreach $a (qw(r s b e h)) { $eq=0 unless($v{$a} eq $lv{$a}); } \ if ($eq && $v{g} ne $lv{g}) { $eq=1; } \ if ($req && $v{qr} eq $lv{qr} && $v{g} ne $lv{g}) { \ $lv{g}.= ",".$v{g} unless($lv{g} =~ m/$v{g}/); $v{g}= $lv{g}; $ngs++; } \ if ($req && $v{qr} ne $lv{qr} && $v{g} eq $lv{g}) { $nhs++; } \ if($eq) { $lv{nx}++; next; } \ else { print join("\t",@lv{qw(r s t b e p o f h g a nx)}),"\n"; $no++; } \ } %lv=%v; @lv=@v; END{ warn"# ngs=$ngs; nhs=$nhs; ni=$ni; no=$no; \n" }'\ > $spp-glean-hsp-$mod.gff2 ## dmoj: ngs=2812; nhs=6621; ni=50472; no=42190; ## dere: ngs=2024; nhs=4113; ni=52110; no=42737; ## dvir: ngs=2286; nhs=6581; ni=50438; no=42205; ## dana: ngs=2640; nhs=5514; ni=52106; no=42797; ## dgri: ngs=2818; nhs=6699; ni=52707; no=44579; ## dpse: ngs=2090; nhs=5727; ni=49528; no=40521; ## dsim: ngs=3929; nhs=4298; ni=54035; no=44623; ## dwil: ngs=2633; nhs=6314; ni=51289; no=42523; ## dyak: ngs=2827; nhs=4075; ni=54912; no=44991; ## dper: ngs=5194; nhs=6602; ni=55902; no=45520; ## dsec: ngs=4177; nhs=3977; ni=55801; no=45680; #--- rewrite as bltab ? for counting ngenes/paralogs ## join by g/gene # species DB query align eval bits exonHSP intronGap len1 # nparalog dist12 len2 dist13 len3 srcid ## step 2 filter................. ## ** aln/len is bad due to overlapping hsps ; track (b,e) ranges ; filter smaller overlapped hsps # grep scaffold_6500 ## setenv spp "dmoj"; \ cat $spp-glean-hsp-$mod.gff2 | perl -ne \ 'BEGIN{print join("\t",qw(species DB queryPG align bits exonHSP len1 nparalog srcid queryHSP)),"\n";}\ chomp; @v= split; @v{qw(r s t b e p o f h g a nx)}= @v; \ $v{ev}=0; $spp=$ENV{spp}; $ni++; \ $valn= $v{aln}= ($v{e}-$v{b}+1) * $v{a}/100; \ $vlen= $v{len}= ($v{e}-$v{b}+1); \ ($v{qr} = $v{h}) =~ s/\_\w+$//; $vqr= $v{qr}; \ if(scalar(%lv)) { \ $req= ($v{r} eq $lv{r}); \ $geq= ($req && ($lv{g} =~ m/\b$v{g}\b/ || $v{g} =~ m/\b$lv{g}\b/)); \ if($geq) { \ $lv{h}.= ",".$v{h} unless($lv{h} =~ m/\b$v{h}\b/);\ map{ $lv{g}.= ",$_" unless($lv{g} =~ m/\b$_\b/); } split(/,/,$v{g}); \ $lv{aln} += $v{aln}; $lv{len} += $v{len}; $lv{p} += $v{p}; \ $lv{nx} += $v{nx}; $lv{nhe}++; next; } \ else { \ print join("\t",$spp,@lv{qw(s g aln p nx len)},1,$lv{r},$lv{h}),"\n"; \ $no++; } \ } %lv=%v; @lv=@v; END{ warn"# ni=$ni; no=$no; \n" }' \ > $spp-glean-hsp-$mod.btab # ^^^ do without ansource/modXX filter, all mod-hsps are joined per pred-gene # problems with source field, multi-mods ... ##-- patch for multi-hsp matches/gene: collect sum-hsp all / pgene, pick longest ## need to know if overlap or not * if($geq) { $lsum{$vqr} += $valn; \ $lpart{$vqr}{len} += $v{len}; $lpart{$vqr}{p} += $v{p}; $lpart{$vqr}{nx} += $v{nx}; \ } else { @lk= keys %lsum; \ if(@lk>1) { \ ($lqr)= sort{$lsum{$b}<=>$lsum{$a}} @lk; \ $lv{aln}= $lsum{$lqr};\ $lv{len}= $lpart{$lqr}{len}; $lv{p}= $lpart{$lqr}{p}; $lv{nx}= $lpart{$lqr}{nx};\ $lv{h}= join",", grep(/$lqr/, split(",",$lv{h}) ); \ } \ ## print ... %lsum=(); $lsum{$vqr} += $valn; \ %lpart=(); $lpart{$vqr}{len} += $v{len}; $lpart{$vqr}{p} += $v{p}; $lpart{$vqr}{nx} += $v{nx};\ } # print join("\t",$spp,@lv{qw(s g aln ev p nx)},qw(NA NA 1 NA NA NA NA),$lv{r},$lv{h}),"\n"; \ ## step 3 filter.............. ## reduce to list of dupl. anno gene IDs + GO categories # drop # cat $spp-glean-hsp-moddm.btab | sort -k10,10 | perl -ne\ # '@v=split; $vp=$v[9]; $vp=~s/\_.*$/_/; $v[-1].="\n"; if($lv[9] =~ m/^$vp/){print join("\t",@v);} @lv=@v;'\ # > $spp-glean-hsp-moddm.dupg # drop # cat $spp-glean-hsp-moddm.dupg | perl -ne \ # '@v=split; @g= map{s/\_.*//; $_; } split(",",$v[9]); print join("\n",@g),"\n";'\ # | sort | uniq -c | sort -k1,1nr > $spp-glean-hsp-moddm.dupg.count ## $spp-glean-hsp-moddm.gcount << as dupg.count + 1st gene matches # keep cat $spp-glean-hsp-$mod.btab | perl -ne\ '@v=split; ($sp,$mod)= ($v[0],$v[1]); %g= map{s/\_.*//; $_,1; } split(",",$v[9]); \ print join("\n",map{ "$_\t$sp\t$mod"} keys %g),"\n";'\ | sort | uniq -c | sort -k1,1nr \ > $spp-glean-hsp-$mod.gcount # cat $spp-glean-hsp-moddm.dupg | perl -ne \ #'@v=split; @g= map{s/\_.*//; $_; } split(",",$v[9]); print join("\n",@g),"\n";' \ #| sort | uniq > $spp-glean-hsp-moddm.dupg.list #wc # 1517 1517 11400 dmoj-glean-hsp-moddm.dupg.list # ggrep -Fw -f $spp-glean-hsp-moddm.dupg.list $bg/dspp/blstats2/modDM.go | perl -ne \ #'chomp; @v=split"\t"; print "$v[3] $v[4]:$v[5]\n";' | sort -k2,2 | uniq -c # >> only 700 matched... fixme ## > redo here to list nparalog / GOclass # cat $bg/dspp/blstats2/modDM.go $spp-glean-hsp-moddm.dupg.count | perl -ne\ # 'chomp; if(/GO:/){ @v=split"\t"; ($p=$v[1]) =~ s/\-.*//; $go{$p} .= "\t$v[3] $v[4]:$v[5]";} \ # else { @v=split; $go= $go{$v[1]}; print "$_$go\n"; }' \ # > $spp-glean-hsp-moddm.dupg.gocount # dmoj: 1103 match GO: terms; 414 no-GO # dere: 730 GO:, 288 no-NO ## make table like this ... add indiv goterms,,, # ------> match/ngeneall-gof.tab.gz <------ # species goclass goid goterm ansource n_protid # agam F GO:0000166 GO:0000166 modCE 2214 # agam F GO:0000166 GO:0000166 modDM 1867 # agam F GO:0000166 GO:0000166 modMM 2858 ## need add counts of 1st gene match cat $bg/dspp/blstats2/$mod.go $spp-glean-hsp-$mod.gcount | perl -ne\ 'BEGIN{$spp= $ENV{spp}; print join("\t",qw(species goclass goid goterm ansource n_protid)),"\n"; } \ chomp; if(/GO:/){ @v=split"\t"; \ ($p=$v[1]) =~ s/\-.*//; $gop{$p} .= "$v[3],"; \ $gc=(split"",uc($v[4]))[0]; $go{$v[3]}= "$v[0]:$gc:$v[5]"; \ } else { \ @v=split; $spp=$v[2]; $mod=$v[3]; @goi= split(/,/,$gop{$v[1]}); \ push(@goi,"NA") unless(@goi); \ foreach (@goi) { $gopar{$_}+= $v[0]; $gogene{$_}.= "$v[1],"; } \ } END{ foreach $g (sort keys %go) { $np=$gopar{$g}||0; $cg=$gogene{$g}; \ ($mod,$gc,$gd)= split(/:/,$go{$g}); \ ($mod,$gc,$gd)=qw(NA NA NA) unless($gc || $gd); \ print join("\t",$spp,$gc,$g,$gd,$mod,$np),"\n"; \ } }' > $spp-glean-hsp-$mod.gotab ....... ==> dyak-glean-hsp-modDM.gcount <== 12 CG7208 dyak modDM 10 CG7824 dyak modDM cat *modDM.gcount | grep -v queryHSP | perl -ne'($n,$g,@x)=split; $g=~s/^CG//; print"$g\n";' | sort -n | uniq | sed -e's/^/CG/' > gene.list cat d*modDM.gcount | egrep -v 'query|Gyc' | sed -e's/^ *[0-9]* CG//' -e's/ .*$//' | sort -k1,1n | uniq | sed -e's/^/CG/' > ! gene2.list # make gene-site x species table for corresp analysis cat gene.list *modDM.gcount | grep -v queryHSP | perl -ne\ 'chomp; if(/^CG/){ push(@gn,$_); } else { ($n,$g,$s,$m)=split; $sp{$s}++; $gn{$g}{$s}=$n; }\ END{ @s= sort keys %sp; print join("\t","site",@s),"\n"; \ foreach $g (@gn) { print "$g"; foreach $s (@s) { $v=$gn{$g}{$s}||0; print "\t$v"; } \ print "\n"; } }' \ # ?? this loses spp x GO/gene info ... but GO is gene attribute, not species # make gene-site x GO phenotype table for coa cat gene.list $bg/dspp/blstats2/$mod.go | grep -v queryHSP | perl -ne\ 'chomp; if(/^CG/){ push(@gn,$_); } \ elsif (/GO:/) { ($m,$g,$go1,$gos,$goc,$t)=split"\t"; $g=~s/\-..$//; $gos{$gos}++; $gn{$g}{$gos}=1; } \ END{ @s= sort keys %gos; print join("\t","site",@s),"\n"; \ foreach $g (@gn) { print "$g"; foreach $s (@s) { $v=$gn{$g}{$s}||0; print "\t$v"; } \ print "\n"; } }' \ setenv mod modMM ; cat *-$mod.gcount | egrep -v 'query|Gyc' | sed -e's/^ *[0-9]* MGI://' -e's/ .*$//' \ | sort -k1,1n | uniq | sed -e's/^/MGI:/' > ! gene-$mod.list ## try also specific GO-gene table cat gene-$mod.list $bg/dspp/blstats2/$mod.go | grep -v queryHSP | perl -ne \ 'chomp; if(/^(CG|MGI)/){ push(@gn,$_); } \ elsif (/GO:/) { ($m,$g,$go1,$gos,$goc,$t)=split"\t"; $g=~s/\-..$//; $gos{$gos}++; $gn{$g}{$gos}=1; } \ END{ @s= sort keys %gos; print join("\t","site",@s),"\n"; \ foreach $g (@gn) { print "$g"; foreach $s (@s) { $v=$gn{$g}{$s}||0; print "\t$v"; } \ print "\n"; } }' \ > ! gene-go-$mod.tab cat gene-$mod.list *-$mod.gcount | grep -v queryHSP | perl -ne \ 'chomp; if(/^(CG|MGI)/){ push(@gn,$_); } else { ($n,$g,$s,$m)=split; $sp{$s}++; $gn{$g}{$s}=$n; }\ END{ @s= sort keys %sp; print join("\t","site",@s),"\n"; \ foreach $g (@gn) { print "$g"; foreach $s (@s) { $v=$gn{$g}{$s}||0; print "\t$v"; } \ print "\n"; } }' \ > ! gene-spp-$mod.tab ## specific GO-gene table >> 4094 GO ids/colums ! way too many.. # cat gene-$mod.list $bg/dspp/blstats2/$mod.go | grep -v queryHSP | perl -ne \ # 'chomp; if(/^(CG|MGI)/){ push(@gn,$_); } \ # elsif (/GO:/) { ($m,$g,$go1,$gos,$goc,$t)=split"\t"; $g=~s/\-..$//; $gos{$go1}++; $gn{$g}{$go1}=1; } \ # END{ @s= sort keys %gos; print join("\t","site",@s),"\n"; \ # foreach $g (@gn) { print "$g"; foreach $s (@s) { $v=$gn{$g}{$s}||0; print "\t$v"; } \ # print "\n"; } }' \ # > ! gene-go1-$mod.tab ## another try, dont make matrix, but 2-column list (db style) of gene-go associations (all go/gene including heirarchy) cat gene-$mod.list $bg/dspp/blstats2/$mod.go | grep -v queryHSP | perl -ne \ 'chomp; if(/^(CG|MGI)/){ push(@gn,$_); } \ elsif (/GO:/) { ($m,$g,$go1,$gos,$goc,$t)=split"\t"; $g=~s/\-..$//; $gn{$g}{$gos}++; $gn{$g}{$go1}++; } \ END{ print join("\t","site","goid"),"\n"; \ foreach $g (@gn) { @gngo= sort keys %{$gn{$g}}; foreach $v (@gngo) { print "$g\t$v\n"; } \ } }' \ > ! gene-go1-$mod.tab ## only primary go terms / gene cat gene-$mod.list $bg/dspp/blstats2/$mod.go | grep -v queryHSP | perl -ne \ 'chomp; if(/^(CG|MGI|WBGene)/){ push(@gn,$_); } \ elsif (/GO:/) { ($m,$g,$go1,$gos,$goc,$t)=split"\t"; $g=~s/\-..$//; $gn{$g}{$go1}++; } \ END{ print join("\t","site","goid"),"\n"; \ foreach $g (@gn) { @gngo= sort keys %{$gn{$g}}; foreach $v (@gngo) { print "$g\t$v\n"; } \ } }' \ > ! gene-go2-$mod.tab ## geneontology.obo to genecat.tab #GO:0005794 C Golgi apparatus #GO:0005618 C cell wall #?? want is_a, part_of parent GO ids, synonyms?, old ids ; ? drop this is_obsolete from list? # need alt_id set; write as extra rows cat $go/gene_ontology.obo | perl -ne 'chomp;\ m/^id: (GO:\d+)/ and $id=$1; \ m/^name: (.+)/ and $nm=$1;\ m/^namespace: (\w+)_(\w)(\w+)/ and $cat=uc($2); \ m/^is_obsolete: true/ and $dead=1;\ m/^alt_id: (GO:\d+)/ and push(@alt,$1); \ m/^is_a: (GO:\d+)/ and push(@isa,$1); \ m/part_of (GO:\d+)/ and push(@isa,$1); \ m/^\s*$/ and do{ $pids=join(",",@isa); \ if($id && $dead==0){ print join("\t",$id,$pids,$cat,$nm),"\n"; \ foreach $a (@alt) { print join("\t",$a,$pids,$cat,$nm." [alt]"),"\n";}} \ ($id,$cat,$nm,$dead,@alt,@isa)=(); };' \ ## > gocatall.tab