# gene-coastats9.R ## this variant reads go-spp-mod.tab instead of gene-spp-mod.tab ## drop/change ggo usage; sites here are GOid not GeneID # /bio/bio-grid/dspp/blstats3/predhsp library("ade4") # correspondence analysis sppabbrev <- as.matrix( c( "C", "W", "W", "M", "m", "s", "c", "y", "e", "a", "r", "p", "w", "j", "v", "g")) row.names(sppabbrev) <- c("cele", "dpulex", "dpul", "agam", # "dmel","dsim","dsec","dyak","dere","dana","dper","dpse","dwil","dmoj","dvir","dgri")# protdbs <- c( "modMM", "modDM", "modCE", "modSC") protmat <- as.matrix( c( "Mouse", "Fruitfly", "Worm", "Yeast")) row.names(protmat) <- protdbs proteomes <- protmat[protdbs,] # #?? need separate COA flyorder and ceorder, or allorder ?? allorder <- c("cele", "dpul", "dmel","dsim","dsec","dyak","dere","dana","dper","dpse","dwil","dmoj","dvir","dgri")# flyorder <- c("dmel","dsim","dsec","dyak","dere","dana","dper","dpse","dwil","dmoj","dvir","dgri")# ceorder <- c("cele", "dpul", "dmel","dpse","dvir");# # modorder <- c( "modDM", "modMM", "modCE") #........ go categories gocats <- read.delim("../gocatall.tab") # id,PARENT,cat,term# names(gocats) <- c("GOid","Parents","Class","Term") row.names(gocats) <- gocats[,1] # GO:id# golabs <- list(P="GO_Process",F="GO_Function",C="GO_CellLocation")# ## fix this mess:# #[1] "nucleobase, nucleoside, nucleotide and nucleic acid metabolism"# ## levels(gocats[,3])[77] <- "nucleic acid metabolism"# # primary species x gene counts correspondence analysis ## do each protdbs modDM, modMM, modCE # for (mod in modorder) { for (mod in "modDM") { cat("#------ Working on ",mod,"---------------------\n") genespp <- read.delim(fn<-paste("go-spp-",mod,".tab",sep=""))# cat("# read ",fn, " dim=",dim(genespp),"\n") # genego <- read.delim(fn<-paste("gene-go-",mod,".tab",sep=""))# # cat("# read ",fn, "dim=",dim(genego),"\n") gspp <- genespp[,-1]; rownames(gspp) <- genespp[,1]# # ggo <- genego[,-1]; rownames(ggo) <- genego[,1]# # names(ggo) <- gsub("\\.",":",names(ggo))# # # ## need gene-go only here? using gene-go1 further category matching # bool.nogo <- (apply(ggo,1,sum)==0)# # #genes with GO.unknown: # # bool.ungo <- ( ggo[,c("GO:0008372","GO:0005554","GO:0000004")]>0 ) # bool.ungo2 <- (apply(bool.ungo,1,sum)>0) # bool.keep <- (!(bool.ungo2 | bool.nogo)) bool.keep <- TRUE for (iorder in 1:2) { ### for (iorder in 1) { if(iorder==1) spporder <- flyorder else spporder <- ceorder spp1 <- spporder[1] ###gspp.go <- gspp[bool.keep,spporder] ###gspp.go[gspp.go > 4] <- 4 # set upper limit to counts, e.g. 4 == 4+ level gspp.go <- gspp[ ,spporder] nspp <- ncol(gspp.go) outf <- paste("gene5go-",mod,"-",spp1,"-coa.out",sep="") cat("# output:",outf,"---------------------","\n") capture.output( cat("#",outf,"\n"), cat("# Filtered data, dim(gspp.go)= ",dim(gspp.go),"\n"), cat("# Proteome:", protmat[mod,],", Species:", paste(names(gspp.go),collapse=","),"\n"), file=outf,append = FALSE) #----- gspp.coa <- dudi.coa(gspp.go, scann = FALSE, nf=nspp-1) ## for GOgroups try PCA, as values are much different (bigger) than gene-spp dataset ## ** PCA no good, gives oddities *** # gspp.coa <- dudi.pca(gspp.go, scann = FALSE, nf=nspp-1) # capture.output( # cat("# USING *** Principle Components dudi.PCA **** (not dudi.COA)\n"), # file=outf,append = FALSE) pclb <- sppabbrev[ row.names(gspp.coa$co),] #.. print summary of gspp.coa: $eig plus spp factors #.. add plot? pdf ok sppdf <- data.frame(matrix(nrow=0,ncol=nspp)) names(sppdf)<-names(gspp.coa$tab) for (ip in 1:gspp.coa$nf) { v<-gspp.coa$co[,ip] sppdf <- rbind(sppdf, trunc(100*(v)/max(abs(v)))) } sppdf[abs(sppdf)<10] <- "." capture.output( cat("# COA eigenvalues:",gspp.coa$eig,"\n"), cat("# COA factors, Species weight, for ", protmat[mod,],"proteome\n"), cat("# ---------------------------------------------------\n"), print(sppdf,digits=1), cat("# ---------------------------------------------------\n"), file=outf,append = TRUE) # write to file gspp.coa **** gspp.coa.l <- gspp.coa class(gspp.coa.l) <- "list" capture.output( gspp.coa.l, file=paste(outf,".data",sep=""), append = FALSE) pdf( file=fn<-paste("gene5go-",mod,"-",spp1,"-coa1.pdf",sep=""), width=8, height=10) nc <- min(5,gspp.coa$nf) plot(gspp.coa$co[,1:nc],pch=pclb,cex=1.5, main=paste( paste(protmat[mod,],"COA species x ngenes"), paste(paste(names(pclb),pclb,sep="="),collapse=", "),sep="\n")) dev.off() capture.output( cat("# plot COA.",1:nc," fn=",fn,"\n"), file=outf,append = TRUE) if(nc < gspp.coa$nf) { nc2 <- min(10,gspp.coa$nf) nc3 <- c(1,nc:nc2) pdf( file=fn<-paste("gene5go-",mod,"-",spp1,"-coa2.pdf",sep=""), width=8, height=10) plot(gspp.coa$co[,nc3],pch=pclb,cex=1.5, main=paste( paste(protmat[mod,],"COA species x ngenes"), paste(paste(names(pclb),pclb,sep="="),collapse=", "),sep="\n")) dev.off() capture.output( cat("# plot COA.",nc3," fn=",fn,"\n"), file=outf,append = TRUE) } #-- want this here or longer list of all GO terms per COA factor ?? ig <- row.names(gspp.coa$li) # ngo <- apply( ggo[ig,],2,sum) ngo <- apply( gspp.go,1,max) # bad? ## names(ngo)<- ig #?? for (ip in 1:gspp.coa$nf) { wt <- gspp.coa$li[ ,ip] # wtgo <- apply( wt * ggo[ig,],2,sum) # mangles names GO: to GO. wtgo <- wt igo <- order(abs(wtgo),decreasing=TRUE) gon1 <- ig[igo] ##? names(ngo[igo]) dfgo1 <- data.frame( GO=gon1, COA_wt=wtgo[igo], Ngenes=ngo[igo], Term=strtrim(as.character(gocats[gon1,"Term"]),30), Kind=as.character(gocats[gon1,"Class"]), # "Class","Term" row.names=NULL ) capture.output( cat("# COA factor.",ip,",", protmat[mod,],"proteome, species x ngene\n"), print(dfgo1[1:15,],digits=2), cat("#-------------------------------------------------------\n\n"), file=outf,append = TRUE) } # end coa list } # end spporder } # end mod