#!/usr/bin/perl # crosstabrief.pl =item info summarize species x gene-go association crosstables (obs - exp gene counts) input: egstats9/genes6-dmel12-fruitfly-v9g2/index-summary-crosstab.txt Cell Contents |-------------------------| | Count | | Expected Values | | Chi-square contribution | | Std Residual | |-------------------------| output: ... index-crossbrief.txt ... species: dmel dsim ... goterm/ GO:0005554 +++ ++ + . - -- --- < +/- sign of deviation width = chi-square levels : >2, >5, >9, >14, >20 (>5? >10? >20?) =cut # print "\n"; # my @gainloss=(); # for $i (0..9) { for $v ("-","+") { $gainloss[$i][$v] ++; } } # print sprintf("%-12s","Gain/Loss"), map { # my($g,$l)=( $gainloss[$_]["+"], $gainloss[$_]["-"]); # sprintf(" %7s",sprintf("+%d/-%d",$g,$l)) # } (0..$#gainloss); # print "\n"; # exit; use strict; # @chilev = (2,5,9,14,19); # @chilev = (1.5,3,6,10,17); # @chilev = (1,2,5,9,15); my @chilev = (0.6,2,5,9,15); my $minchi = 3; # skip if row max(chi) < minchi ## use 4+ for cele-dpul-dros ; 3 for dros-only ?? my %gounknown= map{$_,1} qw(GO:0005554 GO:0000004 GO:0008372); # renamed main class [alt id] my %skipcol = ( dper => 1); # outlier my $nout; my $root = "/Users/gilbertd/Desktop/dspp-work/eugenomestats/egstats9/"; my $datadir = "genes6-dmel12-fruitfly-v9g2/"; my $datapatt = '^genes6.*v9g3$'; #v9g2 v9g3 my $inf = "index-summary-crosstab.txt"; my $gocatf= "../gocatall.tab"; my $outf = "abbreviated-crosstab.txt"; my $outh = *STDOUT; my ($rowhead,@colhead,@coltot,@gainloss, $goid,@cnt,@exp,@chi,@res,$done,$l_goclass); my %gocat= readgocat($root.$gocatf); opendir(D,$root); my @datadir= grep(/$datapatt/,readdir(D)); closedir(D); foreach $datadir (@datadir) { ##next unless($datadir =~ /dmel/); $minchi= ($datadir =~ /cele/) ? 5 : 3; warn "# reading $datadir/$inf; writing to $datadir/$outf\n"; my $outfn="$root/$datadir/$outf"; open(OUT,">$outfn") or die "$outfn"; $outh= *OUT; $done= $nout= 0; @gainloss=(); print $outh " Gene Function Associations in Species Genomes Deviations in gene matches for GO categories by species genomes. Summary of table: $datadir/$inf Minimum Chi^2 per row: $minchi "; my $cell; open(IN,"$root/$datadir/$inf"); while() { chomp; my @val = split(/\s*\|\s*/); my $val1= shift(@val); $val1=~ s/^\s+//; $val1 =~ s/\s+$//; if(/Row Total/) { # header $rowhead= $val1; @colhead= @val; pop(@colhead); $done=0; ## printhead(); } elsif(/Column Total/) { # end table @coltot= @val; $done=1; } elsif (/^\s*(GO:\d+)/) { # cell count $goid= $val1; @cnt = @val; $cell=1; } elsif ($cell==1) { @exp= @val; $cell++; } elsif ($cell==2) { @chi= @val; $cell++; } elsif ($cell==3) { @res= @val; $cell++; } elsif (/^\-\-/) { printcell($goid,\@chi,\@res,\@cnt,\@exp) if($cell); $cell= 0; @chi=@res=@cnt=@exp=(); } if($done) { printfoot() if(@gainloss); } } } sub printhead { #print join("\t",$rowhead,@colhead),"\n"; printfoot() if(@gainloss); print $outh "\n"; print $outh sprintf("%-12s",$rowhead), map { sprintf(" %-6s",$_) }@colhead; print $outh sprintf(" %4s","Ng"); print $outh " ","Term and Grouping"; print $outh "\n"; } sub printfoot { ## perl GOclass sum of ++/-- groupings per species # my(@gainloss)=@_; if(@gainloss) { print $outh sprintf("%-12s","Over/Under"); print $outh map { my($g,$l)=( $gainloss[$_]{"+"}, $gainloss[$_]{"-"}); sprintf(" %-7s","+$g/-$l") ## sprintf("+%d/-%d",$g,$l) } (0..$#gainloss); print $outh sprintf(" %4s"," "); print $outh " ","Relative amount of over- and under-representation in groups (not gene counts)"; print $outh "\n"; @gainloss=(); } } sub printcell { my($goid,$rchi,$rres)= @_; my @out= map{"."} @chi; my $maxchi; my $got = $gocat{$goid}; ##printhead() if($nout>0 && $gounknown{$goid}); ## should be if($goclass ne $l_goclass); printhead() if($got->{class} ne $l_goclass); ## should be if($goclass ne $l_goclass); $l_goclass= $got->{class}; my @gains=(); for my $i (0..$#chi) { my $v= ($res[$i]<0) ? "-" : "+"; my $chi= $chi[$i]; $maxchi=$chi if($maxchi<$chi && ! $skipcol{$colhead[$i]} ); my $nv= 0; for my $n (0..$#chilev) { $nv= $n+1 if($chi > $chilev[$n]); } $out[$i]= $v x $nv if($nv); ## $gains[$i]{$v} += ($nv-1) if($nv>1); # ^^ should this use obs - exp counts ?? or count of groups? $gains[$i]{$v} += 1 if($nv>1); } return if($maxchi < $minchi); my $avexp=0; map{ $avexp += $_; } @exp; $avexp= int($avexp/scalar(@exp)); for my $v ( "-", "+") { for my $i (0..$#chi) { $gainloss[$i]{$v} += $gains[$i]{$v}; } } #print join("\t",$goid,@out),"\n"; print $outh sprintf("%-12s",$goid), map { sprintf(" %-6s",$_) }@out; print $outh sprintf(" %4d",$avexp); print $outh " ",$got->{term},", ",$got->{class}, ":",$gocat{ $got->{grandparent} }->{term} if(ref $got); print $outh "\n"; $nout++; } # goid parent class term sub readgocat { my($fn)= @_; my (@hd); # %gocat, open(F,$fn); while(){ chomp; my @val= split"\t"; unless(@hd) { @hd=@val; } else { my $id= $val[0]; my %gov; @gov{@hd}= @val; $gov{term} =~ s/\[alt\]/unknown/ if($gounknown{$id}); $gocat{$id}= \%gov; } } gograndparents(); return %gocat; } sub gograndparents { foreach my $id (keys %gocat) { my ($lpar,$llpar); for(my $pid = $id; $pid; ) { $llpar= $lpar; $lpar= $pid; my $prow= $gocat{$pid}; last unless($prow && $prow->{parent}); ($pid)= split(/,/, $prow->{parent}); } $gocat{$id}->{grandparent}= $llpar if($llpar); } } =item rtest chisq.test(c(716,717,728,665,658,646,686,579,652,645,643,719)) chisq.test(c(17,20,21,19,21,19,40,35,25,18,15,22)) chisq.test(c(17,20,21,19,21,19,19,19,25,18,15,22)) =cut