# gene-coastats9.R # /bio/bio-grid/dspp/blstats3/predhsp library("ade4") # correspondence analysis sppabbrev <- as.matrix( c( "C", "W", "W", "M", "m", "s", "c", "y", "e", "a", "r", "p", "w", "j", "v", "g")) row.names(sppabbrev) <- c("cele", "dpulex", "dpul", "agam", # "dmel","dsim","dsec","dyak","dere","dana","dper","dpse","dwil","dmoj","dvir","dgri")# protdbs <- c( "modMM", "modDM", "modCE", "modSC") protmat <- as.matrix( c( "Mouse", "Fruitfly", "Worm", "Yeast")) row.names(protmat) <- protdbs proteomes <- protmat[protdbs,] # #?? need separate COA flyorder and ceorder, or allorder ?? allorder <- c("cele", "dpul", "dmel","dsim","dsec","dyak","dere","dana","dper","dpse","dwil","dmoj","dvir","dgri")# flyorder <- c("dmel","dsim","dsec","dyak","dere","dana","dper","dpse","dwil","dmoj","dvir","dgri")# ceorder <- c("cele", "dpul", "dmel","dpse","dvir");# # modorder <- c( "modDM", "modMM", "modCE") #........ go categories gocats <- read.delim("../gocat.tab") # id,cat,term# names(gocats) <- c("GOid","Class","Term") row.names(gocats) <- gocats[,1] # GO:id# golabs <- list(P="GO_Process",F="GO_Function",C="GO_CellLocation")# ## fix this mess:# #[1] "nucleobase, nucleoside, nucleotide and nucleic acid metabolism"# levels(gocats[,3])[77] <- "nucleic acid metabolism"# # primary species x gene counts correspondence analysis ## do each protdbs modDM, modMM, modCE for (mod in modorder) { cat("#------ Working on ",mod,"---------------------\n") genespp <- read.delim(fn<-paste("gene-spp-",mod,".tab",sep=""))# cat("# read ",fn, " dim=",dim(genespp),"\n") genego <- read.delim(fn<-paste("gene-go-",mod,".tab",sep=""))# cat("# read ",fn, "dim=",dim(genego),"\n") gspp <- genespp[,-1]; rownames(gspp) <- genespp[,1]# ggo <- genego[,-1]; rownames(ggo) <- genego[,1]# names(ggo) <- gsub("\\.",":",names(ggo))# ## need gene-go only here? using gene-go1 further category matching bool.nogo <- (apply(ggo,1,sum)==0)# #genes with GO.unknown: # bool.ungo <- ( ggo[,c("GO:0008372","GO:0005554","GO:0000004")]>0 ) bool.ungo2 <- (apply(bool.ungo,1,sum)>0) bool.keep <- (!(bool.ungo2 | bool.nogo)) ### for (iorder in 1:2) { for (iorder in 2) { if(iorder==1) spporder <- flyorder else spporder <- ceorder spp1 <- spporder[1] gspp.go <- gspp[bool.keep,spporder] gspp.go[gspp.go > 4] <- 4 # set upper limit to counts, e.g. 4 == 4+ level nspp <- ncol(gspp.go) outf <- paste("gene4go-",mod,"-",spp1,"-coa.out",sep="") cat("# output:",outf,"---------------------","\n") capture.output( cat("#",outf,"\n"), cat("# Filtered data, count-max=4, dim(gspp.go)= ",dim(gspp.go),"\n"), cat("# Proteome:", protmat[mod,],", Species:", paste(names(gspp.go),collapse=","),"\n"), file=outf,append = FALSE) #----- gspp.coa <- dudi.coa(gspp.go, scann = FALSE, nf=nspp-1) pclb <- sppabbrev[ row.names(gspp.coa$co),] #.. print summary of gspp.coa: $eig plus spp factors #.. add plot? pdf ok sppdf <- data.frame(matrix(nrow=0,ncol=nspp)) names(sppdf)<-names(gspp.coa$tab) for (ip in 1:gspp.coa$nf) { v<-gspp.coa$co[,ip] sppdf <- rbind(sppdf, trunc(100*(v)/max(abs(v)))) } sppdf[abs(sppdf)<10] <- "." capture.output( cat("# COA eigenvalues:",gspp.coa$eig,"\n"), cat("# COA factors, Species weight, for ", protmat[mod,],"proteome\n"), cat("# ---------------------------------------------------\n"), print(sppdf,digits=1), cat("# ---------------------------------------------------\n"), file=outf,append = TRUE) # write to file gspp.coa **** gspp.coa.l <- gspp.coa class(gspp.coa.l) <- "list" capture.output( gspp.coa.l, file=paste(outf,".data",sep=""), append = FALSE) pdf( file=fn<-paste("gene4go-",mod,"-",spp1,"-coa1.pdf",sep=""), width=8, height=10) nc <- min(5,gspp.coa$nf) plot(gspp.coa$co[,1:nc],pch=pclb,cex=1.5, main=paste( paste(protmat[mod,],"COA species x ngenes"), paste(paste(names(pclb),pclb,sep="="),collapse=", "),sep="\n")) dev.off() capture.output( cat("# plot COA.",1:nc," fn=",fn,"\n"), file=outf,append = TRUE) if(nc < gspp.coa$nf) { nc2 <- min(10,gspp.coa$nf) nc3 <- c(1,nc:nc2) pdf( file=fn<-paste("gene4go-",mod,"-",spp1,"-coa2.pdf",sep=""), width=8, height=10) plot(gspp.coa$co[,nc3],pch=pclb,cex=1.5, main=paste( paste(protmat[mod,],"COA species x ngenes"), paste(paste(names(pclb),pclb,sep="="),collapse=", "),sep="\n")) dev.off() capture.output( cat("# plot COA.",nc3," fn=",fn,"\n"), file=outf,append = TRUE) } #-- want this here or longer list of all GO terms per COA factor ?? ig <- row.names(gspp.coa$li) ngo <- apply( ggo[ig,],2,sum) for (ip in 1:gspp.coa$nf) { wt <- gspp.coa$li[ ,ip] wtgo <- apply( wt * ggo[ig,],2,sum) # mangles names GO: to GO. igo <- order(abs(wtgo),decreasing=TRUE) gon1 <- names(ngo[igo]) dfgo1 <- data.frame( GO=gon1, COA_wt=wtgo[igo], Ngenes=ngo[igo], Term=strtrim(as.character(gocats[gon1,3]),30), Kind=as.character(gocats[gon1,2]), row.names=NULL ) capture.output( cat("# COA factor.",ip,",", protmat[mod,],"proteome, species x ngene\n"), print(dfgo1[1:15,],digits=2), cat("#-------------------------------------------------------\n\n"), file=outf,append = TRUE) } # end coa list } # end spporder } # end mod ## species gene counts per major COA factors # sppdf <- data.frame(matrix(nrow=0,ncol=12)); names(sppdf)<-flyorder # for (i in names(wtgo)[1:20]) { # iid <- rownames( ggo[ ggo[,i] > 0, ] ) # spsum <- apply( gspp.d.go[ iid,flyorder], 2, sum, na.rm=TRUE) # sppdf <- rbind( sppdf, spsum) # }