#!/usr/bin/perl # gene2gotab2.pl # gene-go0 == go-slim top categories # gene-go1 == both 0,2 # gene-go2 == gene-go detail assignments # set mod=modDM; cat ../gocatall.tab gene-go2-$mod.tab | perl gene2gortab.pl -g gene-spp-$mod.tab \ # > g2o-spp-$mod.tab my (%gotab,%goids,%gocat,%gopar,%gnspp, @spp); my $maxgenedup= 7; ## see distr below; # limit hsp's/gene to this max #my $mingo=30; $maxLoop=15; ## this gives 650 GO groups #my $mingo=20; $maxLoop=15; ## this gives 900 GO groups my $mingo=10; $maxLoop=15; ## this gives 1400 GO groups, but better species diffs ? ## note: ^ both values (small grps, larger) have meaning depending on gene/go annotation & class ## perhaps best way to combine gene-groups would be with chitest to see if combining maintains/improves distinctions ## rather than homogenizes differences. if($ARGV[0] =~ m/^-g/) { ## -g gene-spp.tab data shift(@ARGV); my $fn= shift(@ARGV); open(F,$fn); while(){ chomp; my @v= split; my $gn= shift @v; if($gn =~ /^site/) { @spp=@v; } # header foreach my $sp (0..$#v) { $v[$sp]= $maxgenedup if($v[$sp]>$maxgenedup); ## limit max match/gene } $gnspp{$gn}= \@v; } } while(<>){ chomp; if (/^GO:\d/) { # gocat table # GO:0008372 C cellular_component unknown # ^^ added GOParent column here (,sep id list) ($go,$gopar,$cterm)=split("\t",$_,3); $gocat{$go}= $cterm; $gopar{$go}= $gopar; } elsif(/\w\sGO:\d/){ # site/goid table ($gn,$go)=split; $gotab{$gn}=[] unless(ref $gotab{$gn}); $goids{$go}++; push(@{$gotab{$gn}},$go); $gopargn{$go}{$gn}++; } } # combine gene counts into base go classes # change here; keep $gn @cnt intact; dont add till output .. sub gocount { my($go)=@_; my @tcnt=(); my $ns= scalar(@spp); # my @gn= keys %{$gospp{$go}}; foreach my $gn (keys %{$gospp{$go}}) { my @gcnt= @{$gospp{$go}{$gn}}; foreach my $i (0..$ns-1) { $tcnt[$i] += $gcnt[$i]; } } return @tcnt; } foreach my $gn (sort keys %gnspp) { my $rcnt= $gnspp{$gn}; ## my @cnt= @{$rcnt}; my @goids= @{$gotab{$gn}}; foreach my $go (@goids) { $gospp{$go}{$gn}= $rcnt; } } # collapse small go cats thru parent GO terms ?? how best to find higher level? foreach my $parentLoop (1..$maxLoop) { foreach my $go (sort keys %gospp) { my @cnt= gocount($go); ## @{$gospp{$go}}; my $cmax=0; map { $cmax=$_ if($cmax<$_); } @cnt; if($cmax < $mingo) { if($parentLoop == $maxLoop) { delete $gospp{$go}; # delete $gopargn{$go}; push(@dropgo,@go); next; } if(my $gopar= $gopar{$go}) { # my @gn= keys %{$gopargn{$go}}; ## this is double-triple counting some genes put in related subcategories ## need to check $gopargn{$pid}{$gn} ## this now is dropping genes; need to merge gospp counts + gene ids my $pid; my %pids=(); my @gopar= split(/,/,$gopar); # which is best? all? my $didpar=0; # foreach $pid (@gopar) { # next unless(ref $gospp{$pid}); # $pids{$pid}++; # $didpar++; # } # unless($didpar) { # $pid= shift @gopar; $pids{$pid}++; $didpar++; # } foreach my $pid (@gopar) { # was keys %pids foreach my $gn (keys %{$gospp{$go}}) { $gospp{$pid}{$gn}= $gospp{$go}{$gn} if(ref $gospp{$go}{$gn}); # wont count gn twice delete $gospp{$go}{$gn}; } } delete $gospp{$go}; # even if (!didpar) ? } } } } print join("\t","site",@spp),"\n"; foreach my $go (sort keys %gospp) { my @cnt= gocount($go); ## @{$gospp{$go}}; print join("\t",$go,@cnt),"\n"; } warn("# writing >gene-gop-modxx.tab\n"); open(GNGO,">gene-gop-modxx.tab"); print GNGO "site\tgoid\n"; foreach my $go (sort keys %gospp) { foreach $gn (sort keys %{$gospp{$go}}) { print GNGO "$gn\t$go\n"; } } close(GNGO);