#!/usr/bin/perl # gogenedetail.pl =item info create tables per GO id of individual gene contributions (not all!) and goid x species total species x gene-go association crosstables (obs - exp gene counts) input for significant GO groups: egstats9/genes6-dmel12-fruitfly-v9g2/index-summary-crosstab.txt input for go term info: ../gocatall.tab input for GO x GeneID crossref: gene-gop2-mod*.tab (for GO combined groups) gene-go2-$mod.tab (for GO-gene groups) input for GOgroup x species counts: crosstab.txt has it, source is g2o-spp-$mod.tab input for GeneID x species counts: gene-spp-$mod.tab =cut use strict; my $dotxt2html= 1; my %gounknown= map{$_,1} qw(GO:0005554 GO:0000004 GO:0008372); # renamed main class [alt id] my %skipcol = ( dper => 1); # outlier my $nout; my $root = "/Users/gilbertd/Desktop/dspp-work/eugenomestats/egstats9/"; my $datadir = "genes6-dmel12-fruitfly-v9g2/"; my $datapatt = '^genes6.*v9g3$'; #v9g2 v9g3 my $inf = "index-summary-crosstab.txt"; my $gocatf= "../gocatall.tab"; my $genesppf= 'gene-spp-mod\w*.tab$'; my $genegocrossf= 'gene-gop3-mod\w+.tab$'; #gop2 gene-gop3b-mod my %thisproteomegene = ( # for genegocrossf filter fruitfly => '^CG\d+', worm => '^WBGene\d+', mouse => '^MGI:\d+', ); my $thisproteome= "fruitfly"; my $outname = "genedetail"; ## genedetail-GOid.txt my $outh = *STDOUT; my ($rowhead,@colhead,@coltot,@gainloss, $goid,@cnt,@exp,@chi,@res,$done,$l_goclass); my (%gocat,%gnspp,%gngotab,@specieslist); # set by readgenespp unless ($dotxt2html) { warn "# reading $gocatf\n"; %gocat= readgocat($root.$gocatf); opendir(D,$root); my @genespp= grep(/$genesppf/,readdir(D)); closedir(D); warn "# reading @genespp\n"; open(F,"cat $root".join(" $root",@genespp)." |"); %gnspp= readgenespp(*F); close(F); opendir(D,$root); my @genego= grep(/$genegocrossf/,readdir(D)); closedir(D); warn "# reading @genego\n"; open(F,"cat $root".join(" $root",@genego)." |"); %gngotab= readgenego(*F); close(F); } opendir(D,$root); my @datadir= grep(/$datapatt/,readdir(D)); closedir(D); foreach $datadir (@datadir) { ##next unless($datadir =~ /dmel/ && $datadir =~ /fruitfly/); next if($datadir =~ /go-v9g2/); # tests # $minchi= ($datadir =~ /cele/) ? 5 : 3; ($thisproteome)= $datadir =~ m/(fruitfly|worm|mouse)/; if ($dotxt2html) { my $txtin="$datadir/$outname-goid.txt"; warn "# reading $txtin; writing to $datadir/$outname-*.html\n"; txt2html($root.$txtin, $root.$datadir); next; # next datadir } ## ?? one file for all, 1-per-goclass? or 1 file/goid ? (about 50+) warn "# reading $datadir/$inf; writing to $datadir/$outname-goid.txt\n"; my $outfn="$root/$datadir/$outname-goid.txt"; open(OUT,">$outfn") or die "$outfn"; $outh= *OUT; $done= $nout= 0; @gainloss=(); print $outh " Gene Detail for Function Associations in Species Genomes Counts of gene matches for GO categories by species genomes. for this grouping: $datadir/ "; my $cell; open(IN,"$root/$datadir/$inf"); while() { chomp; my @val = split(/\s*\|\s*/); my $val1= shift(@val); $val1=~ s/^\s+//; $val1 =~ s/\s+$//; if(/Row Total/) { # header $rowhead= $val1; @colhead= @val; pop(@colhead); $done=0; ## printhead(); } elsif(/Column Total/) { # end table @coltot= @val; $done=1; } elsif (/^\s*(GO:\d+)/) { # cell count $goid= $val1; @cnt = @val; $cell=1; } elsif ($cell==1) { @exp= @val; $cell++; } elsif ($cell==2) { @chi= @val; $cell++; } elsif ($cell==3) { @res= @val; $cell++; } elsif (/^\-\-/) { printcell($goid,\@chi,\@res,\@cnt,\@exp) if($cell); $cell= 0; @chi=@res=@cnt=@exp=(); } if($done) { printfoot() if(@gainloss); } } } sub printhead { # printfoot() if(@gainloss); print $outh "\n"; # print $outh sprintf("%-12s",$rowhead), map { sprintf(" %-6s",$_) }@colhead; # print $outh sprintf(" %4s","Ng"); # print $outh " ","Term and Grouping"; # print $outh "\n"; } sub printfoot { # if(@gainloss) { # print $outh sprintf("%-12s","Over/Under"); # print $outh map { # my($g,$l)=( $gainloss[$_]{"+"}, $gainloss[$_]{"-"}); # sprintf(" %-7s","+$g/-$l") ## sprintf("+%d/-%d",$g,$l) # } (0..$#gainloss); # print $outh sprintf(" %4s"," "); # print $outh " ","Relative amount of over- and under-representation in groups (not gene counts)"; # print $outh "\n"; # @gainloss=(); # } } sub printcell { my($goid,$rchi,$rres)= @_; # ($goid,\@chi,\@res,\@cnt,\@exp) my @out= @cnt; my $rowtot= pop(@out); # map{"."} @chi; my $maxchi; #?? my $outf= "$outname-$goid.txt"; my $got = $gocat{$goid}; printhead() if($got->{class} ne $l_goclass); ## should be if($goclass ne $l_goclass); $l_goclass= $got->{class}; my $thisproteomegene= $thisproteomegene{$thisproteome}; my @gogenes= grep(/$thisproteomegene/, sort keys %{$gngotab{$goid}} ); # ^^ now has genes for all source proteomes; want only 1 source ; ## really want to resort these by chi value/differences my $ngenes= @gogenes; # not $avexp; my $gnote=""; my $ngeneshown= $ngenes; ## now long/short table of gogenes if($ngenes > 50) { $gnote=" (50 listed)"; $ngeneshown=50; } print $outh sprintf("%-12s",""); print $outh $goid," Ngenes=$ngenes$gnote,"; # print $outh sprintf(" Ngenes=%d",$ngenes); # want true gene num here print $outh " ",$got->{term},", ",$got->{class}, ":",$gocat{ $got->{grandparent} }->{term} if(ref $got); print $outh "\n"; print $outh sprintf("%-12s","GO/Gene ID"), map { sprintf(" %4s",$_) }@colhead; ## == species list print $outh "\n"; print $outh sprintf("%-12s",$goid), map { sprintf(" %4d",$_) }@out; print $outh "\n"; my $si= 0; my %sppindx = map { $_ => $si++; } @specieslist; my %inspp2dat= map { $_ => $sppindx{$_}; } @colhead; foreach my $ig (0..$ngeneshown-1) { ## ? order by chi ? (obs - exp)^2/exp my $geneid= $gogenes[$ig]; unless($gnspp{$geneid}) { print $outh sprintf("%-12s",$geneid)," ** missing data **\n"; next; } my @gnc = @{$gnspp{$geneid}}; # @specieslist order my @gncount= map { my $si= $inspp2dat{$_}; (defined $si ? $gnc[$si] : 0); } @colhead; print $outh sprintf("%-12s",$geneid), map { sprintf(" %4d",$_) }@gncount; print $outh "\n"; } print $outh "-"x75,"\n\n"; $nout++; } # geneID goID sub readgenego { my($fh)= @_; my(@hd,%tab); while(<$fh>){ chomp; my @v= split; my $gn= shift @v; if($gn =~ /^site/) { @hd= @v; } # header my $go= shift @v; $tab{$go}{$gn}++; ## $tab{$go}= $gn; #oops got many gn/go, many go/gn } return %tab; } # site species1 species2 .... speciesN sub readgenespp { my($fh)= @_; my(@hd,%tab); ##open(F,$fn) or die "$fn"; while(<$fh>){ chomp; my @v= split; my $gn= shift @v; if($gn =~ /^site/) { @specieslist=@v; } # header # foreach my $sp (0..$#v) { # $v[$sp]= $maxgenedup if($v[$sp]>$maxgenedup); ## limit max match/gene # } $tab{$gn}= \@v; } return %tab; } # goid parent class term sub readgocat { my($fn)= @_; my (@hd); # %gocat, open(F,$fn) or die "$fn"; while(){ chomp; my @val= split"\t"; unless(@hd) { @hd=@val; } else { my $id= $val[0]; my %gov; @gov{@hd}= @val; $gov{term} =~ s/\[alt\]/unknown/ if($gounknown{$id}); $gocat{$id}= \%gov; } } gograndparents(); return %gocat; } sub gograndparents { foreach my $id (keys %gocat) { my ($lpar,$llpar); for(my $pid = $id; $pid; ) { $llpar= $lpar; $lpar= $pid; my $prow= $gocat{$pid}; last unless($prow && $prow->{parent}); ($pid)= split(/,/, $prow->{parent}); } $gocat{$id}->{grandparent}= $llpar if($llpar); } } # reformat this with hyperlinks to gene reports, gbrowse map view of gene matches sub txt2html { my($fn,$todir)= @_; my %idurls =( GO => "http://eugenes.org/fbservlet/goreport/id=", fruitfly=>"/flybase/cgi-bin/fbidq.html?", mouse=>"/mgi/idlookup/", worm=>"/wormbase/idlookup/", yeast=>"/sgd/idlookup/", ); my %gn2spp =( "GO:" => "GO", "CG" => "fruitfly", "WBGene" => "worm", "MGI:" => "mouse"); #my $urlhspmap = "http://melon.bio.indiana.edu/cgi-bin/gbrowse/"; # my $urlhspmap = "http://insects.eugenes.org/species/cgi-bin/gbrowse/"; my $urlhspmap = "/species/cgi-bin/gbrowse/"; my %spp2mapspp= map{ $_ => $urlhspmap.$_."/?name=HSP:"; } ("cele", "dpul", "dmel","dsim","dsec","dyak","dere","dana","dper","dpse","dwil","dmoj","dvir","dgri"); $spp2mapspp{"dpul"} = "http://wfleabase.org/cgi-bin/gbrowse/dpulex/?name=HSP:"; $spp2mapspp{"cele"} = ""; # nothing? open(F,$fn); my $fh= *F; my($goid,$html,@species,$intab,$lab); while(<$fh>){ # chomp; if(m/(GO:\d+)\s+Ngenes=/){ # GOid table header $goid=$1; $html= "$goid gene detail\n"; $html.="
\n";
      $html.= $_;
    } elsif(s,^GO/Gene ID,GO/Gene_ID,) { # dang space
      my @v=split;
      my $lab= shift(@v); 
      @species= @v;
      $intab=1;  
      $html.= $_;
    } elsif(m/^\-\-/) {
      $intab=0;
      $html.= $_;
      $html.= "
\n"; if($goid) { my $outf="$outname-$goid.html"; $outf =~ s,[:],,g; open(H,">$todir/$outf"); print H $html; $html=""; close(H); } } elsif($intab && m/^\w/) { my @v= split; my $gn= shift(@v); # $goid= $gn if($gn =~ /^GO/); my($t)= $gn =~ m/^(\D+)/; my $hgn= $idurls{$gn2spp{$t}} . $gn; my $isp=0; s,$gn,$gn,; if($gn !~ /^GO:/) { ## for nasty gbrowse-lucene failure 'MGI:000' ':' is dropped, drop 'MGI:' here $gn =~ s/^MGI://; foreach my $v (@v) { chomp($v); if($v>0) { my $hmap= $spp2mapspp{$species[$isp]}.$gn.'_*'; s,(\s)$v(\s),$1$v$2,; } $isp++; } } $html.= $_; } # print $_; #?? write to separate genedetail-$goid.html *** } } =item index-brief add this: head -7 index-summary.html | tail -5 |\ cat - abbreviated-crosstab.txt | perl -pe\ 'BEGIN{print"Gene Function GO Associations in Species Genomes
\n";}\
print "
" if(/Gene Function A/); s,^GO:(\d+),GO:$1,;\
END{print"
\n";}'\ > index-brief.html =cut =item r-table.html url.hspmap <- "http://melon.bio.indiana.edu/cgi-bin/gbrowse/" url.GOlookup <- "http://eugenes.org/fbservlet/goreport/id=" idurls <- list( fruitfly="/flybase/cgi-bin/fbidq.html?", mouse="/mgi/idlookup/", worm="/wormbase/idlookup/", yeast="/sgd/idlookup/", ) ## need rewriting here: search?q=fban-SYM:(a+b+c) mgi?id=a,b,c worm?xxx=a+b+c ? listurls <- list( fruitfly="/flybase/lucegene/search?q=fban-SYM:", mouse="/mgi/idlist/", worm="/wormbase/idlist/", yeast="/sgd/idlist/", ) =cut