## GO terms x species for blast protein match stats setwd("/Users/gilbertd/Desktop/dspp-work/eugenomestats") library("gplots") # load function cim -- below bltab <- read.delim("dsppDMmain.gotab") tabname <- "Protein matches to D. melanogaster (Main Euchromatin, all HSP)" blspp <- bltab$species bloutl <- bltab allorder <- c("cele","dpulex","agam","dmel","dsim","dsec","dyak","dyak3r", "dere","dana","dper","dpse","dpse2r","dwil","dmoj","dvir","dgri") blspp <- factor(bltab$species, levels = allorder, ordered = TRUE) bloutl$species <- blspp blmeans <- aggregate(bloutl[,4:11], list(species = blspp), FUN=mean, na.rm = TRUE) blciw <- aggregate(bloutl[,4:11], list(species = blspp), FUN=cim, na.rm = TRUE) #--- > bltab[1:5,] species DB query align eval bits exonHSP intronGap len1 nparalog srcid GOC 1 dyak modDM CG8236-PA 504 0e+00 771.0 1 NA 1511 1 chr3R F 2 dyak modDM CG8236-PA 504 0e+00 771.0 1 NA 1511 1 chr3R P 3 dyak modDM CG8236-PA 504 0e+00 771.0 1 NA 1511 1 chr3R C 4 dyak modDM CG17917-PA 213 2e-90 418.2 2 265 637 1 chr3R F 5 dyak modDM CG17917-PA 213 2e-90 418.2 2 265 637 1 chr3R P GOID 1 2 3 4 GO:0004872 5 GO:0007165 > names(bltab) [1] "species" "DB" "query" "align" "eval" "bits" "exonHSP" [8] "intronGap" "len1" "nparalog" "srcid" "GOC" "GOID" gvars <- c(4,6,7,8,9,10) goblf <- bltab[bltab$GOC == "F",] goblf <- bltab[bltab$GOC == "P",] goblf <- bltab[bltab$GOC == "C",] aggregate(goblf[,gvars], list(species = factor(goblf$species)), FUN=mean, na.rm = TRUE) aggregate(goblf[,gvars], list(goid = factor(goblf$GOID)), FUN=mean, na.rm = TRUE) gomn <- aggregate(goblf[,gvars], list(species = factor(goblf$species), goid = factor(goblf$GOID)), FUN=mean, na.rm = TRUE) ## bits,nparalog have a few spp x GOID signif interactions nparalog.lm <- lm(nparalog ~ species * GOID - 1, data=goblf) summary(nparalog.lm) >>> GO:0016209 antioxidant dmoj vs dyak nparalog GO:0016209 DNA binding moj/yak nparalog GO:0005773 vacuole nparalog 4 species, nparalog lm anova: speciesdvir:GOIDGO:0009536 4.5000000 1.7250314 2.609 0.009096 ** ?? plastid; is it contam. speciesdyak:GOIDGO:0005773 2.4769231 0.9982608 2.481 0.013100 * vacuole speciesdvir:GOIDGO:0005768 4.0000000 1.9685968 2.032 0.042176 * endosome lm(formula = nparalog ~ species * GOID - 1, data = goblf) Residuals: Min 1Q Median 3Q Max -2.5000 -0.4799 -0.3479 0.2915 82.5201 Coefficients: Estimate Std. Error t value Pr(>|t|) speciesdere 1.3076923 0.3720298 3.515 0.000441 *** speciesdmoj 1.3846154 0.3720298 3.722 0.000198 *** speciesdvir 1.3076923 0.3720298 3.515 0.000441 *** speciesdyak 1.2307692 0.3720298 3.308 0.000940 *** #..... # ^^ need here to aggregate/split also by GOID # want plot x = species, panels == GOID ?? blnames <- names(blmeans) blnames[4] <- "D.mel protein similarity (Bit score)" blnames[3] <- "D.mel protein similarity (missing x Bit score)" blnames[2] <- "D.mel protein Aligned length" blnames[5] <- "N HSP (exons)" blnames[6] <- "Gap (intron) size" blnames[7] <- "Length of transcript" blnames[8] <- "N Duplicates (Paralogs)" blnames[9] <- "Duplicate distance" ## genepred vars #nvar <- c(5,6,7) ## nvar <- c(4,3,2,5,6,7,9) ###nvar <- c(4,2,5,6,7,9) nvar <- c(4,2,5,6,8) blegend <- paste("Figures show genome averages for",tabname, "\nwith 95% confidence intervals. Species include three outgroups (C.elegans, crustacean Daphnia pulex,", "\nAnopholes gambia), and twelve Drosophila species, Dmel .. Dgri, for CAF1 assemblies.") lvar1 <- 1+length(nvar); lmat <- matrix( 1:lvar1, lvar1, 1, byrow=TRUE) nf <- layout(lmat, heights = c(lcm(3.5),rep(1, lvar1-1)) ) plot(c(1,2,1,2),type="n",xlab="",ylab="",axes = FALSE) mtext(blegend,side=3,line=-5) for (i in nvar) { mn <- blmeans[,i] ci <- blciw[,i] yname <- blnames[i] par(mar=c(3,3,1,1)) plotCI( x=mn, uiw=ci, ylab=yname, xlab="Species", gap=0.01, col="black", barcol="blue", xaxt="n") axis(side=1, at=1:nrow(blmeans), labels=as.character(blmeans$species), cex.axis=1.5) lside <- 3 # ifelse( i < 5, 1, 3) mtext(yname, side=lside, line = -2 ) } # cim <- qt(0.975, n) * stdev / sqrt(n) cim <- function (x, na.rm = FALSE) { if (is.matrix(x)) apply(x, 2, cim, na.rm = na.rm) else if (is.vector(x)) { nd <- ifelse(na.rm, sum(!is.na(x)), length(x)) q <- qt(0.975, nd) q * sqrt(var(x, na.rm = na.rm) / nd) } else if (is.data.frame(x)) sapply(x, cim, na.rm = na.rm) else { v <- as.vector(x) nd <- ifelse(na.rm, sum(!is.na(v)), length(v)) q <- qt(0.975, nd) q * sqrt(var(v, na.rm = na.rm) / nd) } } #--------------------------------- # GO crosstabs #--------------------------------- ## from goblstats8.R ## 2006may07: # setwd("/Users/gilbertd/active/gmod04work/eugenomes/eugenomestats/egstats9") # setwd("/Users/gilbertd/Desktop/dspp-work/eugenomestats/egstats9") # library("gplots") # plotCI # library("gmodels") # crosstable gocats <- read.delim("../gocat.tab") # id,cat,term rownames(gocats) <- gocats[,1] # GO:id names(gocats) <- c("GOid","Class","Term") golabs <- list(P="GO_Process",F="GO_Function",C="GO_CellLocation") ## fix this mess: #> levels(gocats[,3])[77] #[1] "nucleobase, nucleoside, nucleotide and nucleic acid metabolism" # levels(gocats[,3])[77] <- "nucleic acid metabolism" protdbs <- c( "modMM", "modDM", "modCE", "modSC") protmat <- as.matrix( c( "Mouse", "Fruitfly", "Worm", "Yeast")) ; row.names(protmat) <- protdbs proteomes <- protmat[protdbs,] dpulexorder <- c( "cele", "dpulex", "dmel", "dyak", "dmoj", "dvir") flyorder <- c("dmel","dsim","dsec","dyak","dere","dana","dper","dpse","dwil","dmoj","dvir","dgri") allorder <- c("cele", "dpulex", "agam", "dmel","dsim","dsec","dyak","dere","dana","dper","dpse","dwil","dmoj","dvir","dgri") fly1order <- c("dmel","dsim","dyak","dere","dana","dpse","dwil","dmoj","dvir","dgri") flyxorder <- c("cele", "dpulex", fly1order) dpulexorder <- c( "dpulex", "dmel","dyak","dpse","dmoj","dvir") ceorder <- c( "cele", "dpulex", "dmel","dpse","dvir"); modorder <- c( "modDM", "modCE", "modMM",) sppabbrev <- as.matrix( c( "Ce", "Wf", "Ag", "Dm", "Ds", "Dc", "Dy", "De", "Da", "Dr", "Dp", "Dw", "Dj", "Dv", "Dg")) row.names(sppabbrev) <- c("cele", "dpulex", "agam", "dmel","dsim","dsec","dyak","dere","dana","dper","dpse","dwil","dmoj","dvir","dgri") version <- "v9" gvers <- "-v9" gvar <- "ngenes"; gvlab <- "Number of gene matches (HSP groups) " # library("gmodels") # crosstable # source("../dassocplot.R") goptab <- read.delim("ngeneall-gop.tab") goptab <- read.delim("ngeneall-gof.tab") goptab <- read.delim("ngeneall-goc.tab") goptab <- read.delim("bitscore-gop.tab") ## this conversion helps bitscores goptab$n_protid <- sqrt(goptab$n_protid + 1) goptab <- read.delim("ngenes-gop.tab") #GOcat <- "GO_Process" goptab <- read.delim("ngenes-gof.tab") #GOcat <- "GO_Function" goptab <- read.delim("ngenes-goc.tab") #GOcat <- "GO_CellLocation" ## CLEANED data, matched hsps to predicted gene to join hsp parts, etc. # melon:/bio/bio-grid/dspp/blstats3/predhsp/ngenes-glean-hsp-moddm.tab.gz goalltab <- read.delim("ngenes-glean-hsp.tab") goptab <- goalltab[goalltab$goclass == "P",] # select out P,F,C #---- 29 Dec 06: *** Gene-Detail GO Associations (g2) give best phylo separation *** source("../goassocplot9.R") go2sppdm <- read.delim("go2-spp-modDM.tab") dxtabs <- go2sppdm[,spporder1] row.names(dxtabs) <- go2sppdm[,1] plotlab <- paste("Species: ",paste(spporder1,collapse=", ")) #---------- processing ----------------- goptab$goclass[ goptab$goclass == FALSE ] <- "F" # fixup for read.delim parser F -> FALSE GOcat <- as.character(golabs[as.character(goptab$goclass[1])]) class(goptab$goterm) <- "character" for (r in 1:nrow(goptab)) { gid <- as.character(goptab$goid[r]) glab <- gocats[( gocats[,1]==gid), 3] if(length(glab)>0) goptab$goterm[r] <- as.character( glab) } goptab$goterm <- factor(goptab$goterm) spporder1 <- dpulexorder spporder1 <- flyorder #spporder1 <- ftestorder sppkey <- length(spporder1) ## fixme gop_x3way <- xtabs(n_protid ~ goterm + ansource + species, goptab, drop.unused.levels=TRUE) gop_x3way <- gop_x3way[,,allorder] gop_x2way <- xtabs(n_protid ~ goterm + species, goptab, drop.unused.levels=TRUE) dxtabs <- gop_x2way[,spporder1] dxtabs[3:8,] ## species x prot ansource gop_x2as <- xtabs(n_protid ~ ansource + species, goptab, drop.unused.levels=TRUE) #............ modDM .... dassocplothtml <- paste( "

Deviations in gene match counts for GO categories by species genomes.

These may indicate where species genes differ in functional categories. Low counts or 'missing genes' may be due to divergence rather than lack; extra gene matches indicate something more is there.\n") # Statistically significant deviations are brightly colored. # Mouse here .. is it clearer than notDM ? yes, merging species confuses the signals pdb <- "modMM" # Worm here pdb <- "modCE" # Fly here pdb <- "modDM" sppkey <- length(spporder1) ## fixme gdir <- paste(gvar,"-",spporder1[1],length(spporder1),"-",GOcat,"-",tolower(protmat[pdb,]),gvers,"/",sep="") linkpage <- paste("index-detail.html",sep="") ## use gdir, not new sdir ... dir.create(gdir) dxtabs <- gop_x3way[,pdb,spporder1] proteomes <- protmat[pdb,] # ( crossfile <- paste(gdir,length(spporder1),"-",GOcat,"-",pdb,"-",version,sep="")) ( crossfile <- paste(gdir,"index-summary",sep="") ) png() goassocplot(dxtabs,GOcat,crossfile,linkpage=linkpage,htmlheader=dassocplothtml, ) # .......... notDM ...... gdir <- paste(gvar,"-",length(spporder1),"-",GOcat,"-","modALL",gvers,"/",sep="") proteomes <- protmat[ protdbs[protdbs != "NA"],] gdir <- paste(gvar,"-",length(spporder1),"-",GOcat,"-","notDM",gvers,"/",sep="") dir.create(gdir) linkpage <- paste("index-detail.html",sep="") ## try all BUT dmel *** YES, do modDM and notDM as two views *** goptab.nodm <- goptab[ (goptab$ansource != "modDM"), ] gop_x2way.nodm <- xtabs(n_protid ~ goterm + species, goptab.nodm, drop.unused.levels=TRUE) dxtabs <- gop_x2way.nodm[,spporder1] proteomes <- protmat[ protdbs[protdbs != "modDM"],] ( crossfile <- paste(gdir,"index-summary",sep="") ) png() goassocplot(dxtabs,GOcat,crossfile,linkpage=linkpage,htmlheader=dassocplothtml, ) # ....... Dpulex ....... spporder1 <- dpulexorder sppkey <- length(spporder1) ## fixme # ......... test 3 species query plots, all dros, dpulex target genomes Function Catalitic activity = GO:0003824 Process Signal transduction = GO:0007165 > modorder <- c( "modMM", "modCE", "modDM") > gop_x3way["GO:0007165",modorder,spporder1] species ansource dmel dsim dsec dyak dere dana dper dpse dwil dmoj dvir dgri modMM 2758 2712 2338 2914 2843 2744 2160 2913 2723 2861 2664 2112 modCE 476 462 411 501 485 458 327 513 466 476 476 421 modDM 1322 1316 1098 1432 1341 1271 1014 1369 1253 1382 1290 1321 attr(,"call") xtabs(formula = n_protid ~ goterm + ansource + species, data = goptab, drop.unused.levels = TRUE) dxtabs <- gop_x3way[,modorder,spporder1] ## use this loglin() margin for mosaicplot: ## means margins in model for goterm, ansource, goterm x ansource, ## but NOT species # BUT See this R graphics plot # too many goterms ... # mosaicplot( ~species + goterm + ansource, data=dxtabs, color=TRUE) mosaicplot( ~ goterm + ansource + species, data=dxtabs[rkeep,,], shade=c(1,2,3,4,5), margin=gamargin) mosaicplot(~ goterm + ansource + species, data=dxall[rkeep[1:7],,], shade=c(1,2,3,4,5), margin=gamargin, type="pearson", main="Deviations in GO PROCESS categories for species gene matches") #..... fly1order <- c("dmel","dsim","dyak","dere","dana","dpse","dwil","dmoj","dvir","dgri") flyxorder <- c("cele", "dpulex", fly1order) ##dpulexorder <- c("cele", "dpulex", "dmel","dyak","dpse","dmoj","dvir") dpulexorder <- c( "dpulex", "dmel","dyak","dpse","dmoj","dvir") ceorder <- c( "cele", "dpulex", "dmel","dpse","dvir"); sppabbrev <- as.matrix( c( "Ce", "Wf", "Ag", "Dm", "Ds", "Dc", "Dy", "De", "Da", "Dr", "Dp", "Dw", "Dj", "Dv", "Dg")) row.names(sppabbrev) <- c("cele", "dpulex", "agam", "dmel","dsim","dsec","dyak","dere","dana","dper","dpse","dwil","dmoj","dvir","dgri") modorder <- c( "modDM", "modCE", "modMM",) gop_x3way <- xtabs(n_protid ~ goterm + ansource + species, goptab, drop.unused.levels=TRUE) gamargin.old <- list(c(1),c(2),c(1,2)) ; gashades <- c(0.2,1.5,2.5,3.5,4.5) dxall <- gop_x3way[,modorder,dpulexorder] rkeep <- (apply(dxall,1,sum) > 1000) (gatitle <- paste("Deviations in ",GOcat," categories for species gene matches",sep="")) pdf( file=paste("mosaic3-ngenes6-",GOcat,".pdf",sep=""), width=60, height=7) pdf( file=paste("mosaic3-ngenes12-",GOcat,".pdf",sep=""), width=60, height=7) mosaicplot(~ goterm + ansource + species, data=dxall[rkeep,,], shade=gashades, margin=gamargin, type="pearson", main= gatitle) dev.off() #.............. gashades <- c(0.2,1.5,2.5,3.5,4.5) gamargin.orig <- list(c(1),c(2),c(1,2)) ; ## ?? add margin for species total (esp. bitscore, but also ngenes)?? gamargin.spp <- list(c(1),c(2),c(1,2) , c(3)) ; ## this one subtracts species x source-proteome margin gamargin.spro <- list(c(1),c(2),c(1,2), c(3), c(2,3)) ; # for species x ansource gop_x2as 2way gamargin.span <- list(c(1),c(2)) ; ## goid was goterm << later got real terms gop_x3way <- xtabs(n_protid ~ goid + ansource + species, goptab, drop.unused.levels=TRUE) dxall <- gop_x3way[,modorder,flyorder] ## dsec,dper are low-gene outliers for main, use for all dxall <- gop_x3way[,modorder,ceorder] # better than dpulex set dxall <- gop_x3way[,modorder,fly1order] # dxall <- gop_x3way[,modorder,dpulexorder] #? rkeep limit? == max(dxsum) / 20 # ** and/or force mimimum width of GO term panels by (a) adjusting counts so # min GO count >= 5% of max? (b) adjust mosaicplotdg to do this? (c) expand plot width ## somthing like: dpsum + ((dpsum * (median(dpsum)/dpsum)) - dpsum ) * 0.1 ## dpsum + ((dpsum * (median(dpsum)/dpsum)) - dpsum ) * 1.0 == all at median ## dpsum + ( (dpsum * (median(dpsum)/dpsum)) - dpsum ) * 0.3 == optimal adjustment? # > range(dpsum) # [1] 1296 27603 # > range(dpsum + ((dpsum * (median(dpsum)/dpsum)) - dpsum ) * 0.2) # [1] 1838 22884 # > range(dpsum + ((dpsum * (median(dpsum)/dpsum)) - dpsum ) * 0.3) # [1] 2110 20524 # > range(dpsum + ((dpsum * (median(dpsum)/dpsum)) - dpsum ) * 0.5) # [1] 2652 15806 tscore <- "bits" rkeep <- (apply(dxall,1,sum) > 100000) # bitscore sum(rkeep) tscore <- "ngenes" rkeep <- (apply(dxall,1,sum) > 1000) sum(rkeep) dxp <- dxall[rkeep,,] dpsum <- apply(dxp,1,sum) dpfac <- (dpsum + 0.3 * ( (dpsum * (median(dpsum)/dpsum)) - dpsum ) ) / dpsum for (i in 1:nrow(dxp)) { dxp[i,,] <- round(dxp[i,,] * dpfac[i]) } # npan selected so row in 5..10 columns npan <- 5 nr <- nrow(dxp) ## fix this add space in last cut for Shade Legend; ~ 1/4 width # dxi <- split(1:nr, cut(1:nr, npan)) ## list dxi <- split(1:nr, cut(1:(nr+2), npan)) # fix for shade legend space in last row source("../mosaicplotdg.R") sppabbrev <- as.matrix( c( "Ce", "Wf", "Wf", "Ag", "Dm", "Ds", "Dc", "Dy", "De", "Da", "Dr", "Dp", "Dw", "Dj", "Dv", "Dg")) row.names(sppabbrev) <- c("cele", "dpulex", "dpul", "agam", "dmel","dsim","dsec","dyak","dere","dana","dper","dpse","dwil","dmoj","dvir","dgri") dimnames(dxp)$species <- sppabbrev[dimnames(dxall)$species,] dimnames(dxp)$ansource <- protmat[dimnames(dxall)$ansource,] (gatitle <- paste("Mosaic Plot: Deviations in ",GOcat," categories for gene matches to genomes",sep="")) sppt <- paste(dimnames(dxall)$species, sppabbrev[dimnames(dxall)$species,], collapse=", ", sep="=") (spptitle <- paste("Target Species (columns):", sppt)) (antitle <- paste("Gene sources (rows):",paste(dimnames(dxp)$ansource, collapse=", "))) goidlist <- paste( dimnames(dxp)$goid, gocats[dimnames(dxp)$goid,3], sep="= ", collapse=", ") goidlist <- paste("GO Term list: ",goidlist) ##goidlist <- strwrap(goidlist,width=120) # width=120 good for pdfwidth=10 goidlist <- strwrap(goidlist,width=200) # width=200 good for pdfwidth=12 gpn <- paste(dimnames(dxall)$species[1],length(dimnames(dxall)$species),sep="") (gpfile <- paste("mosaic5a-",tscore,"-",gpn,"-",GOcat,".pdf",sep="")) pdf( file=gpfile, width=12, height=4*npan) ## use pdf -> png instead, better resolution ## png( file=paste("mosaic3b-ngenes6-",GOcat,".png",sep=""), width=800, height=1000) lmat <- matrix( 1:(npan+2), npan+2, 1, byrow=TRUE) #nf <- layout(lmat) nf <- layout( lmat, heights = c(lcm(2),rep(1, npan),lcm(0.6*length(goidlist))) ) par(mar=c(2,2,1,1)) plot( c(1,2,1,2),type="n",xlab="",ylab="",axes = FALSE) mtext(paste(gatitle,spptitle,antitle,sep="\n"),side=3,line=-4) ## was goterm, changed to goid field for (ip in 1:npan) { doleg <- if(ip == npan) TRUE else FALSE mosaicplot(~ goid + ansource + species, data=dxp[ dxi[[ip]],,], shade=gashades, shadelegend=doleg, margin=gamargin, type="pearson", cex.axis = 1.0, main="") } ## margin w/o goterms // t(dxp) ?? throws of gamargin tho ## use gamargin <- list(c(2)) with t(dxp) to get right orientation ## shows expected fruitfly/Dmel x species bitscores ## and also Dsim is low for mouse, celegans bitscores # gop_x2as <- xtabs(n_protid ~ ansource + species, goptab, drop.unused.levels=TRUE) # dxm <- gop_x2as[modorder,flyorder] # mosaicplot( t(dxm), margin=list(c(2)), # shade=gashades, shadelegend=TRUE, type="pearson", cex.axis = 1.0, main="") nlist <- 1:length(goidlist) plist <- seq(0.5,length(goidlist)+0.5,0.5) par(mar=c(1,1,1,1)) plot( plist, plist, type="n",xlab="",ylab="",axes = FALSE) text(x=0.5,y=rev(nlist),labels=goidlist,adj=0) dev.off() #.............. ## limit to many gene classes rkeep <- c() for (i in 1:nrow(dxtabs)) { if(sum(dxtabs[i,"modDM",]) > 1000) rkeep<-c(rkeep,i) } flyxorder <- c("cele", "dpulex", "dmel", "dsim", "dyak", "dere", "dana", "dpse", "dwil", "dmoj","dvir", "dgri") # drop dper, dsec as general outliers (and agam) #----- norms <- 129865 / apply(dxtabs,2,sum) dxnorm <- dxtabs for(i in modorder) { dxnorm[,i,] <- as.integer(dxnorm[,i,] * norms[i]) } normn <- function(x) { ns[i] * x } apply(dxtabs,2, * ns) #------------------- #--------------------------------- # GO cat gene lists #--------------------------------- source("../goblfunc.R") # arghhhhh, current data set doesnt have missing genes (i.e. ngenes=0) needed for ngenes stats # add for all genes x species, 0 if spporder1[i] is missing gopa <- read.delim("dsppblmain12.gop") # .gof, .gop, .goc ## dropped agam,cele,dpulex from dsppblmain12 gop <- gopa[gopa$DB == "modDM",] gopm <- gopa[gopa$DB == "modMM",] gopc <- gopa[gopa$DB == "modCE",] gopy <- gopa[gopa$DB == "modSC",] #?? drop dsec,dper as outliers ?? gop1 <- gop[ !(gop$species == "agam" | gop$species == "cele" | gop$species == "dpulex" | is.na(gop$GOID) ),] gop1 <- gop[ !( is.na(gop$GOID) ),] ## FIXME: invar <- c(4:11) # was(4:14) ; 4:11 spporder1 <- flyorder ## FIXME: names(gop1)[10] <- "ngenes" # was nparalog ## FIXME: ## patch for missing data (should be there...) for v = ngenes # [1] "species" "DB" "query" "align" "eval" "bits" "exonHSP" # [8] "intronGap" "len1" "ngenes" "dupDist" "len2" "dist13" "len3" #[15] "srcid" "GOC" "GOID" gopadd <- c(); ## data.frame(gop1[0,]) for (db in levels(factor(gop1$DB))) { for (q in levels(factor(gop1$query))) { gid <- levels(factor((gop1$GOID[gop1$query == q])[1])) goc <- levels(factor((gop1$GOC[gop1$GOID == gid])[1])) for (s in levels(gop1$species)) { if(sum(gop1$query == q & gop1$species == s) == 0) { ##slow##gop1 <- rbind(gop1, c(s,db,q,NA,NA,NA,NA,NA,NA,0,NA,NA,NA,NA,NA,goc,gid)) gopadd <- rbind(gopadd, c(s,db,q,NA,NA,NA,NA,NA,NA,0,NA,NA,NA,NA,NA,goc,gid)) } } } } gopadd <- as.data.frame(gopadd) names(gopadd) <- names(gop1) gop2 <- rbind(gop1, gopadd) goclass <- as.character(gop1$GOC[1]); if(goclass == FALSE) goclass <- "F" ; # dang R gop1$species <- factor(gop1$species, levels = spporder1, ordered = TRUE) sppmeans <- aggregate(gop1[,invar], list(Species = gop1$species), FUN=mean, na.rm = TRUE) gidmeans <- aggregate(gop1[,invar], list(Goid = gop1$GOID), FUN=mean, na.rm = TRUE) gidn <- aggregate(gop1[,4], list(Goid = gop1$GOID), FUN=length) # sppsd <- aggregate(gop1[,invar], list(Species = gop1$species), FUN=sd, na.rm = TRUE) gidlist <- levels(gop1$GOID) pdb <- "modDM" proteomeList <- paste(protmat[pdb,],collapse=",") proteome <- tolower(proteomeList) GOcat <- as.character(golabs[goclass]) # == GOcat ( gdir <- paste(gvar,"-",length(spporder1),"-",GOcat,"-",proteome,gvers,"/",sep="") ) dir.create(gdir) ## this should be index.html? # gtabfile <- paste(gdir,gvar,"-",GOcat,"-table.html",sep="") gtabfile <- paste(gdir,"index-detail.html",sep="") noplot <- FALSE cat( htmltop( paste( gvlab," to Gene Ontology classes
in Drosophila species genomes
", "",GOcat," category, ",proteomeList," proteome ") ), itemrefs(gidlist), sep="\n", file=gtabfile) for (gi in gidlist[1:5]) { #gidlist[1:9] gidlist gilab <- paste(gi, as.character(gocats[gocats[,1] == gi,3])) cat( "

",paste("",sep=""), gilab,"

", sep="\n", file=gtabfile, append=TRUE) cat("test ",gilab,"..\n") qbest <- bestgenes(gi, gvar) testv <- qbest$qdev[1,"mindev"] if(is.null(testv)) { cat("test bad\n") cat("Data is too limited to test.\n", file=gtabfile, append=TRUE); } else if(testv < 1.0) { # 0.95 #?? change to 1.0 ? imgf <- paste( gvar,"-mn-",sub(":","",gi), ".png",sep="") #? add version to name? cat("plot ",imgf, "dev=",testv,"..\n") if(!noplot) { png( paste(gdir,imgf,sep=""),width=900,height=300) } sigdif <- plotgenes(gi,gvar,qbest,noplot=noplot) if(!noplot) { dev.off() } if(is.na(sigdif) || sigdif < 0) { cat("No significant difference from population mean.\n"); cat("No significant difference from population mean.\n", "See means plot
\n", file=gtabfile, append=TRUE); cat(" \n", file=gtabfile, append=TRUE); } } } cat( "",file=gtabfile, append=TRUE) #------------------ ## discrim analysis; no good for this data Iris <- data.frame(rbind(iris3[,,1], iris3[,,2], iris3[,,3]), Sp = rep(c("s","c","v"), rep(50,3))) genegotab <- read.delim("genego1.tab") genegox <- xtabs(ndup ~ goids + species + geneid, genegotab, drop.unused.levels=TRUE) > dim(genegox) [1] 120 11 8572 s <- flyorder[1] ggd <- data.frame( t(genegox[,s,]), Sp = rep(s, rep(8572,1))) s <- flyorder[2] ggd2 <- rbind(ggd, cbind( t(genegox[,s,]), Sp = rep(s, rep(8572,1))) ) sp1 <- dimnames(genegox[,,1])$species gi1 <- dimnames(genegox[,1,])$geneid genego.dat <- data.frame( rbind( t(genegox[,1,]), t(genegox[,2,]), t(genegox[,3,]), t(genegox[,4,]), t(genegox[,5,]), t(genegox[,6,]), t(genegox[,7,]), t(genegox[,8,]), t(genegox[,9,]), t(genegox[,10,]), t(genegox[,11,]) ), species = rep(sp1, rep(8572,11)), geneid = rep(gi1, 11) ) > dim(genego.dat) [1] 94292 122 train <- sample(1:94292, 1000) > genego.sub1 <- genego.dat[train,] z <- lda(Sp ~ ., Iris, prior = c(1,1,1)/3, subset = train) z <- lda(species ~ ., genego.sub1) #------------------ # try vegan correspondence analyses # ade4 is better setwd("/Users/gilbertd/Desktop/dspp-work/eugenomestats/egstats9") # library("vegan") library("ade4") #^^ see also ~/Desktop/06summer/rstats/ade4src/R/reconst.R sppabbrev <- as.matrix( c( "C", "W", "W", "M", "m", "s", "c", "y", "e", "a", "r", "p", "w", "j", "v", "g")) row.names(sppabbrev) <- c("cele", "dpulex", "dpul", "agam", "dmel","dsim","dsec","dyak","dere","dana","dper","dpse","dwil","dmoj","dvir","dgri") # r=genes, c=goid, 0/1 values genego <- read.delim("gene-go.tab") > dim(genego) [1] 13209 121 > names(genego) [1] "site" "GO.0000003" "GO.0000004" "GO.0000166" "GO.0000228" "GO.0003676" ... # r=genes, c=species, 0/n values genespp <- read.delim("gene-spp.tab") > dim(genespp) [1] 13209 12 > names(genespp) [1] "site" "dana" "dere" "dgri" "dmoj" "dper" "dpse" "dsec" "dsim" "dvir" "dwil" "dyak" gspp <- genespp[,-1]; rownames(gspp) <- genespp[,1] ggo <- genego[,-1]; rownames(ggo) <- genego[,1] names(ggo) <- gsub("\\.",":",names(ggo)) gspp.dist <- dist(t(gspp)) gspp.hc <- hclust(gspp.dist) gspp.cut <- cutree(gspp.hc,k=3) gspp1 <- gspp[1:100,] ggo1 <- ggo[1:100,] # set max level gspp1[(gspp1 > 3)] <- 3 # ade4 tests g1 <- acm.disjonctif(gspp1) # bb <- acm.burt(gspp1, gspp1) == inner prod g1, g2 ## need to remove no-go rows gyesgo1 <- rownames( ggo1[apply(ggo1,1,sum) > 0,]) # gspp1.ddm <- dudi.mix(gspp1, scann = FALSE) ## this isn't as good as dudi.coa ? # nogo# ggo1.ddm <- dudi.mix(ggo1, scann = FALSE) gspp1.coa <- dudi.coa(gspp1, scann = FALSE) ## this isn't as good as dudi.coa ? ggo1.coa <- dudi.coa(ggo1y, scann = FALSE) # works ## coa bad for coinertia: not same row weights $lw ## try pca, which inputs row weights .. > gspp1.pca <- dudi.pca(gspp1y, scann = FALSE) > gspp1.pca$lw[1:5] [1] 0.001307190 0.001307190 0.001307190 0.001307190 0.001307190 > ggo1.pca <- dudi.pca(ggo1y, scann = FALSE) > ggo1.pca$lw[1:5] [1] 0.001307190 0.001307190 0.001307190 0.001307190 0.001307190 ggospp1.coin <- coinertia(gspp1.pca,ggo1.pca, scan = FALSE, nf = 2) # plots of prcomp gspp.d.pca <- dudi.pca(gspp.d, scann=FALSE, nf=10) pclb <- sppabbrev[rownames(gspp.dro.pca$co),] pclbt <- paste(paste(names(pclb),pclb,sep="="),collapse=", ") plot(gspp.dro.pca$co[,1:4],pch=pclb,cex=1.5, main=paste("PCA species x ngenes, GO groups",pclbt,sep="\n")) #. test genespp.modDM w/o GO unknown (or no-GO) genes # need gene-go-modDM.tab, ../gocat.tab dgbook% grep unk ../gocat.tab GO:0008372 C cellular_component unknown GO:0005554 F molecular_function unknown GO:0000004 P biological_process unknown genespp.m <- read.delim("gene-spp-modMM.tab") genego.m <- read.delim("gene-go-modMM.tab") genespp.d <- read.delim("gene-spp-modDM.tab") genego.d <- read.delim("gene-go-modDM.tab") # converts GO:n to GO.n ?? gspp <- genespp.d[,-1]; rownames(gspp) <- genespp.d[,1] ggo <- genego.d[,-1]; rownames(ggo) <- genego.d[,1] names(ggo) <- gsub("\\.",":",names(ggo)) #genes without GO: log.nogo <- (apply(ggo,1,sum)==0) #genes with GO.unknown: log.ungo <- ( ggo[,c("GO:0008372","GO:0005554","GO:0000004")]>0 ) log.ungo2 <- (apply(log.ungo,1,sum)>0) log.keep <- (!(log.ungo2 | log.nogo)) # > n=7749 of 13340 gspp.d.go <- gspp[log.keep,] # Try truncated counts: max(4)? ** works ** gspp.d.go[gspp.d.go > 4] <- 4 gspp.d.go.pca <- dudi.pca(gspp.d.go, scann=FALSE, nf=10) plot(gspp.d.go.pca$co[,1:4],pch=pclb,cex=1.5, main=paste("PCA species x ngenes, GO groups",pclbt,sep="\n")) ... pca weights few genes with many matches high () ... pca eig 2,3 show best dros species phylo clusters (dmel+, dpse+dper, dmoj+dvir+dgri) .. Genes CG31611,.. have many cross matches to similar genes PC2 == Dmel/.. vs Dpse/Dper PC3 == Dvir/moj/gri vs Dpse/Dper > gspp.d.go.pca$eig [1] 5.0135987 1.3662827 1.0172301 0.7906558 0.6828586 0.6547318 0.5548502 0.4989402 0.4366518 [10] 0.3358832 0.3357165 0.3126006 > a2 <- order(abs(gspp.d.go.pca$li[,2]),decreasing=TRUE) > a3 <- order(abs(gspp.d.go.pca$li[,3]),decreasing=TRUE) > (gspp.d.go.pca$li[a1,])[1:10,1:4] Axis1 Axis2 Axis3 Axis4 CG31618 56.73108 -23.374168 5.2745906 9.9907857 CG31613 51.30680 -17.048150 12.1602909 16.6620970 CG31617 45.96426 -16.928844 5.4252022 10.4285390 CG3379 26.56561 6.475141 15.4529923 -12.5037090 CG9614 26.30546 -4.803061 -2.3625138 -0.1446511 CG12052 24.43942 2.005862 -0.9075465 -0.5335803 CG31611 20.62975 -23.537057 -8.4765554 20.4806855 CG16858 20.32432 6.674266 0.9766208 -0.8707096 CG5929 17.58002 -7.660439 -5.5884469 -1.7128644 CG18660 15.54068 -1.326493 -1.1521542 -2.9344657 > (gspp.d.go.pca$li[a2,])[1:20,1:4] Axis1 Axis2 Axis3 Axis4 CG31611 20.629755 -23.537057 -8.476555 20.4806855 CG31618 56.731082 -23.374168 5.274591 9.9907857 CG31613 51.306802 -17.048150 12.160291 16.6620970 CG31617 45.964265 -16.928844 5.425202 10.4285390 CG16983 10.796822 15.343033 -8.393321 2.5196553 CG7740 13.048294 14.361580 -1.956365 5.7862384 CG17686 12.548494 -13.442596 -8.843974 1.7422532 CG16719 7.852367 13.143768 -9.907043 4.2767111 CG8642 15.337652 11.108076 6.651566 4.9763287 CG7439 6.768344 9.432867 -11.193754 3.2958427 CG1200 4.723482 9.218441 -9.501277 6.3749190 CG8174 8.128664 -9.062387 -6.023523 -4.1298405 CG10726 4.724282 8.925162 -8.546644 4.3669428 CG17927 8.651004 -8.508913 -1.106002 6.0434634 CG6577 3.718076 8.399228 -8.797432 3.3356821 CG1404 10.329458 8.052476 -4.943493 0.9676516 CG17737 4.847968 8.035588 -5.546870 3.1891548 CG18420 4.092624 -7.853961 -5.197479 -0.3923082 CG18241 7.375327 7.730427 -7.019203 -6.8616730 CG10862 3.426851 7.724512 -8.232890 3.3232165 > (gspp.d.go.pca$li[a3,])[1:20,1:4] Axis1 Axis2 Axis3 Axis4 CG3379 26.565607 6.475141 15.452992 -12.5037090 CG31613 51.306802 -17.048150 12.160291 16.6620970 CG7439 6.768344 9.432867 -11.193754 3.2958427 CG1710 7.140153 2.154641 11.055006 2.5626408 CG1826 8.800084 6.917978 -10.852512 2.1005287 CG16719 7.852367 13.143768 -9.907043 4.2767111 CG1200 4.723482 9.218441 -9.501277 6.3749190 CG9520 12.430093 3.618760 9.273966 -0.5119244 CG17686 12.548494 -13.442596 -8.843974 1.7422532 CG6577 3.718076 8.399228 -8.797432 3.3356821 CG10726 4.724282 8.925162 -8.546644 4.3669428 CG31611 20.629755 -23.537057 -8.476555 20.4806855 CG15436 4.794525 6.385144 -8.433624 2.5256488 CG16983 10.796822 15.343033 -8.393321 2.5196553 CG12754 6.436395 3.877559 8.374718 2.3919013 CG10862 3.426851 7.724512 -8.232890 3.3232165 CG6538 5.327396 3.245759 7.620748 1.7634763 CG5912 3.652966 6.463355 -7.258773 3.4159590 CG18241 7.375327 7.730427 -7.019203 -6.8616730 CG10652 3.464510 6.570222 -6.915190 3.3157401 .... PC2uniq: CG7740 CG8174 CG17927 CG1404 CG17737 CG18420 PC2 == Dmel/.. vs Dpse/Dper > gspp.d.go[pc2g,flyorder] dmel dsim dsec dyak dere dana dper dpse dwil dmoj dvir dgri CG1404 1 2 2 3 2 3 9 8 3 4 4 2 CG17737 1 1 1 1 1 1 9 7 2 2 2 2 CG17927 14 3 3 2 3 3 2 1 1 1 2 3 CG18420 3 6 7 2 3 1 1 1 0 0 0 0 CG7740 1 1 1 1 1 2 11 11 3 6 7 7 CG8174 1 15 9 1 1 1 1 1 1 1 2 1 PC3uniq: CG1710 CG1826 CG9520 CG15436 CG12754 CG6538 PC3 == Dvir/moj/gri vs Dpse/Dper > gspp.d.go[levels(pc3g),flyorder] dmel dsim dsec dyak dere dana dper dpse dwil dmoj dvir dgri CG12754 1 1 1 1 1 1 2 1 2 8 3 10 CG15436 1 1 1 3 2 1 10 7 0 0 2 1 CG1710 1 2 1 1 1 2 1 0 1 1 1 26 CG1826 1 4 6 1 1 1 10 10 4 1 1 2 CG6538 1 1 1 1 1 1 1 1 1 14 2 1 CG9520 1 1 3 4 2 3 2 2 4 7 3 17 PC2+PC3: CG7439 CG17686 CG16719 CG1200 CG10726 CG6577 CG18241 CG10862 > gspp.d.go[pc23g,flyorder] dmel dsim dsec dyak dere dana dper dpse dwil dmoj dvir dgri CG10726 1 1 1 1 1 1 13 7 1 1 1 2 CG10862 1 1 1 1 1 1 9 8 1 0 2 0 CG1200 1 0 1 1 1 1 23 1 1 1 1 1 CG16719 1 1 1 1 1 1 11 13 3 2 3 1 CG17686 3 8 24 1 1 2 2 2 1 1 2 1 CG18241 1 2 2 4 1 1 7 7 14 0 0 0 CG6577 1 1 1 1 1 1 9 9 1 1 1 0 CG7439 1 2 2 2 1 1 15 8 2 1 1 1 goids <- names(ggo) goids.pc3 <- gsub("\\.",":", goids[ ( apply(ggo[pc3g,]>0,2,sum) > 0 ) ] ) gocats[goids.pc3,2:3] # ?? want count of genes/category ?? .... try corresp. anal. for categories ... *** This looks better **** gspp.d.go.coa <- dudi.coa(gspp.d.go, scann = FALSE, nf=10) > (gspp.d.go.coa$eig) [1] 0.041218857 0.031537838 0.023290136 0.017583959 0.016371367 0.015792268 0.013052163 [8] 0.011758224 0.010587591 0.008716868 0.007144527 pclb <- sppabbrev[rownames(gspp.d.go.coa$co),] plot(gspp.d.go.coa$co[,1:4],pch=pclb,cex=1.5, main=paste("COA species x ngenes, GO groups", paste(paste(names(pclb),pclb,sep="="),collapse=", "),sep="\n")) ... PC1 + PC2 == 3 main phylo groups separated PC1 == +Dmel/sibs vs others (Dana middle) ^^dmel genes found only in dmel, siblings .. see also w/ genes.modMM PC2 == +Dpse/Dper vs Dvir/moj/gri PC3 == +Dwil vs -Dgri PC4 == +Dmel vs others PC5 == +Dvir/Dmoj vs -Dgri/Dwil PC6 == +Dsim/Dsec vs -Dana PC7 == +Dana vs -Dyak PC8 == +Dsim -Dsec PC9 == +Dpse -Dper PC10== +Dmoj -Dvir ii <- apply( abs(gspp.d.go.coa$li), 2, order, decreasing=TRUE) # a1 <- order(abs(gspp.d.go.coa$li[,1]), decreasing=TRUE) ## cog <- names(gspp.d.go.coa$li[ ii[1:10,3],1]) co3g <- rownames(gspp.d.go.coa$li[ii[1:100,3],]) co2g <- rownames(gspp.d.go.coa$li[ii[1:100,2],]) co1g <- rownames(gspp.d.go.coa$li[ii[1:100,1],]) PC3 uniqg: dana dere dgri dmel dmoj dper dpse dsec dsim dvir dwil dyak CG5197 1 1 1 1 1 1 1 2 1 1 21 1 > (gspp.d.go.coa$li[a1,])[1:15,1:4] Axis1 Axis2 Axis3 Axis4 CG1075 1.446993 0.04782293 -0.3986317 2.907830 CG5878 1.446993 0.04782293 -0.3986317 2.907830 CG9807 1.446993 0.04782293 -0.3986317 2.907830 CG11051 1.446993 0.04782293 -0.3986317 2.907830 CG15864 1.446993 0.04782293 -0.3986317 2.907830 CG18853 1.446993 0.04782293 -0.3986317 2.907830 CG31054 1.446993 0.04782293 -0.3986317 2.907830 CG31687 1.446993 0.04782293 -0.3986317 2.907830 CG31822 1.446993 0.04782293 -0.3986317 2.907830 CG31953 1.446993 0.04782293 -0.3986317 2.907830 CG32077 1.446993 0.04782293 -0.3986317 2.907830 CG32283 1.446993 0.04782293 -0.3986317 2.907830 CG32671 1.446993 0.04782293 -0.3986317 2.907830 CG32673 1.446993 0.04782293 -0.3986317 2.907830 CG32678 1.446993 0.04782293 -0.3986317 2.907830 ^^^ lots with same weight == Dmel only ... fix? > gspp.d.go[co1g,flyorder] dmel dsim dsec dyak dere dana dper dpse dwil dmoj dvir dgri CG1075 1 0 0 0 0 0 0 0 0 0 0 0 CG5878 1 0 0 0 0 0 0 0 0 0 0 0 CG9807 1 0 0 0 0 0 0 0 0 0 0 0 CG11051 1 0 0 0 0 0 0 0 0 0 0 0 CG15864 1 0 0 0 0 0 0 0 0 0 0 0 CG18853 1 0 0 0 0 0 0 0 0 0 0 0 CG31054 1 0 0 0 0 0 0 0 0 0 0 0 CG31687 1 0 0 0 0 0 0 0 0 0 0 0 CG31822 1 0 0 0 0 0 0 0 0 0 0 0 CG31953 1 0 0 0 0 0 0 0 0 0 0 0 CG32077 1 0 0 0 0 0 0 0 0 0 0 0 CG32283 1 0 0 0 0 0 0 0 0 0 0 0 CG32671 1 0 0 0 0 0 0 0 0 0 0 0 CG32673 1 0 0 0 0 0 0 0 0 0 0 0 CG32678 1 0 0 0 0 0 0 0 0 0 0 0 > (gspp.d.go.coa$li[a2,])[1:15,1:4] Axis1 Axis2 Axis3 Axis4 CG18290 -1.21834398 2.181440 -0.5289369 0.280682545 CG32581 -0.95308473 1.644596 -0.2584034 0.052181557 CG1200 -0.84910114 1.444354 -0.3483338 0.227629058 CG1710 -0.66230100 -1.145888 -1.0191933 -0.001849589 CG16736 -0.20534858 1.137960 -0.2498729 0.074055084 CG6577 -0.59519540 1.124157 -0.1462305 0.046282530 CG10862 -0.58132217 1.119792 -0.1455446 0.061282204 CG17450 0.11432448 1.114631 -0.4637843 1.594256054 CG7439 -0.56707193 1.082969 -0.1572052 0.021530721 CG11512 -0.10552043 1.062549 -0.2563033 0.167856128 CG10726 -0.68651220 1.060161 -0.2867744 0.088537568 CG15150 -0.08206601 1.015516 -0.2384856 0.033895671 CG30002 -0.98165797 1.006482 0.7234955 0.223665518 CG31773 -0.65168275 1.005817 -0.3050046 0.120368147 CG33246 -0.80104720 -1.000887 -0.3333971 -0.139303864 > gspp.d.go[co2g,flyorder] dmel dsim dsec dyak dere dana dper dpse dwil dmoj dvir dgri CG18290 0 0 0 0 0 0 1 0 0 0 0 0 CG32581 0 0 0 0 0 1 4 2 0 0 0 0 CG1200 1 0 1 1 1 1 23 1 1 1 1 1 CG1710 1 2 1 1 1 2 1 0 1 1 1 26 CG16736 1 1 1 1 1 1 6 1 0 0 0 0 CG6577 1 1 1 1 1 1 9 9 1 1 1 0 CG10862 1 1 1 1 1 1 9 8 1 0 2 0 CG17450 1 0 0 0 0 0 1 0 0 0 0 0 CG7439 1 2 2 2 1 1 15 8 2 1 1 1 CG11512 1 1 0 1 0 0 2 2 0 0 0 0 CG10726 1 1 1 1 1 1 13 7 1 1 1 2 CG15150 1 1 1 1 1 0 3 3 0 0 0 0 CG30002 0 0 0 0 0 0 1 1 1 0 0 0 CG31773 1 1 1 1 1 1 13 3 1 1 1 2 CG33246 0 0 0 0 0 0 0 0 0 1 0 0 > (gspp.d.go.coa$li[a3,])[1:15,1:4] Axis1 Axis2 Axis3 Axis4 CG31809 -0.805019008 -0.5889729 2.8631436 0.516632847 CG30031 -0.521677779 -0.4151642 2.1202171 0.176148030 CG5197 -0.450400123 -0.3582679 1.7316506 0.291178821 CG14718 -0.277589301 -0.2759403 1.4486865 0.252143848 CG10245 -0.201282630 -0.4170832 1.4337543 0.443857679 CG8425 -0.431380512 -0.3701936 1.4270637 0.266402908 CG31960 -0.785888896 -0.7002635 1.2650460 0.297359576 CG6098 -0.324075637 -0.2453215 1.1460722 0.210380783 CG7118 -0.321294489 -0.4302586 1.1220746 0.148595101 CG9976 -0.351267898 0.0626286 1.1068804 0.239412732 CG9500 -0.379664543 0.1073942 1.0582976 0.166707627 CG1710 -0.662300998 -1.1458885 -1.0191933 -0.001849589 CG14219 -0.003967043 -0.1305222 1.0042418 0.143766727 CG2065 0.072457702 -0.3086720 0.9980903 -0.004692832 CG18241 -0.396853010 0.4537257 0.9079431 0.085121142 > gspp.d.go[co3g,flyorder] dmel dsim dsec dyak dere dana dper dpse dwil dmoj dvir dgri CG31809 0 0 0 0 0 0 0 0 1 0 0 0 CG30031 0 0 0 0 0 1 0 0 2 0 0 0 CG5197 1 1 2 1 1 1 1 1 21 1 1 1 CG14718 1 1 1 1 1 1 1 0 10 1 1 0 CG10245 1 0 0 1 0 1 0 0 4 1 0 0 CG8425 1 1 1 1 2 1 1 1 16 1 1 2 CG31960 0 0 0 0 0 0 0 0 1 0 1 0 CG6098 1 1 1 1 1 1 1 1 9 1 1 1 CG7118 1 2 1 1 1 1 0 1 11 2 3 1 CG9976 2 1 1 1 1 4 3 3 12 2 0 0 CG9500 1 2 2 1 2 0 4 3 13 1 2 0 CG1710 1 2 1 1 1 2 1 0 1 1 1 26 CG14219 1 1 1 1 0 1 0 1 4 0 1 0 CG2065 1 2 1 1 0 3 0 0 5 0 0 1 CG18241 1 2 2 4 1 1 7 7 14 0 0 0 ...... # Try truncated counts: max(4)? ** works ** # Correspondence for 5 classes: 0,1,2,3,4+ matches gspp.d.go[gspp.d.go > 4] <- 4 gspp.d.go4.coa <- dudi.coa(gspp.d.go, scann = FALSE, nf=11) pclb <- sppabbrev[rownames(gspp.d.go4.coa$co),] plot(gspp.d.go4.coa$co[,1:5],pch=pclb,cex=1.5, main=paste("COA species x ngenes(0..4), GO groups", paste(paste(names(pclb),pclb,sep="="),collapse=", "),sep="\n")) plot(gspp.d.go4.coa$co[,c(1,6:10)],pch=pclb,cex=1.5, main=paste("COA species x ngenes(0..4), GO groups", paste(paste(names(pclb),pclb,sep="="),collapse=", "),sep="\n")) PC1= +Dmel/sibs vs -others (Dvir/.., Dpse/Dper) PC2= +Dgri/Dvir/Dmoj/Dwil -Dpse/Dper PC3= +Dgri -Dwil PC4= +Dmel -Dsim -others PC5= +Dyak/Dana -Dsim/Dwil PC6= +Dmoj/Dvir -Dgri PC7= +Dana -Dyak PC8= +Dsim/Dyak/Dmel -Dsec PC9= +Dper -Dpse PC10= +Dmoj -Dvir ii <- apply( abs(gspp.d.go4.coa$li), 2, order, decreasing=TRUE) # ** add sign of coa$li to indicate +/- effect for (ip in 1:2) { # gspp.d.go4.coa$nf itop <- ii[1:15,ip] print( paste("COA",ip,sep="#")) print( gspp.d.go[ rownames(gspp.d.go4.coa$li[ itop,]),flyorder] * sign(gspp.d.go4.coa$li[ itop,ip] ) ) } sppdf <- data.frame(matrix(nrow=0,ncol=12)); names(sppdf)<-flyorder for (ip in 1:gspp.d.go4.coa$nf) { sppdf <- rbind(sppdf, trunc(100*(v<-gspp.d.go4.coa$co[flyorder,ip])/max(abs(v)))) } sppdf[abs(sppdf)<10] <- "." print(sppdf) pupagn carbestgn pupagndf <- data.frame(matrix(nrow=0,ncol=length(carbestgn))); names(pupagndf)<-carbestgn for (ip in 1:gspp.d.go4.coa$nf) { pupagndf <- rbind(pupagndf, trunc(100*(v<-gspp.d.go4.coa$li[carbestgn,ip])/max(abs(v)))) } pupagndf[abs(pupagndf)<10] <- "." print(pupagndf) # DMEL gene COA components dmel dsim dsec dyak dere dana dper dpse dwil dmoj dvir dgri 1 99 94 89 75 50 . -63 -55 -59 -77 -76 -89 +Dmel+sibs -Dgri/Dvir/Dmoj/Dwil/Dpse/Dper 2 . . . . . . -100 -78 17 46 38 63 -Dpse/Dper +Dgri/Dvir/Dmoj 3 17 14 . . . -38 21 . -100 15 14 51 -Dwil +Dgri/vir/moj 4 100 -56 -23 -15 . . . . . . . . +Dmel -Dmelsibs 5 -27 -80 -31 100 30 88 -15 . -63 . . . +Dyak/Dana -Dsim/Dwil 6 . -13 15 . 11 -13 . . -28 87 63 -100 +Dmoj/Dvir -Dgri 7 12 33 -12 -70 -16 99 . -11 -29 . . . +Dana/ -Dyak/ 8 16 58 -100 39 -17 -10 . . . 20 . -12 -Dsec/ +Dsim/ 9 . . 12 . . . 73 -100 . 31 -30 . -Dpse +Dper 10 . . . -10 . . -21 36 . 78 -100 . -Dvir +Dmoj 11 . . -29 -30 100 -10 . . . . . . +Dere # MOUSE gene COA (neg of DMEL COA; e.g. mouse genes high where dmel genes low?) dmel dsim dsec dyak dere dana dper dpse dwil dmoj dvir dgri 1 -94 -98 -92 -67 -54 . 100 90 39 55 57 57 -Dmel-sibs +Non-Dmel 2 15 19 14 . . -15 100 89 -40 -58 -76 -64 +Dpse/Dper -Dgri/Dvir/Dmoj 3 . -12 . . . 39 . . 99 -33 -42 -22 +Dwil -Dvir/Dmoj 4 14 . . 14 . -100 . . 51 66 10 -65 -Dana/Dgri +Dmoj/Dwil/Dvir 5 25 30 . -16 -19 -88 . . 34 -62 -20 100 +Dgri -Dana/Dmoj 6 -31 -74 -12 100 57 -27 . . . -30 . 17 +Dyak/Dere -Dsim 7 . . 14 -15 . . . . 11 -74 100 -39 +Dvir -Dmoj 8 -95 . 100 -30 25 -14 . . . . . . +Dsec -Dmel 9 -86 100 -36 68 -55 . 15 -18 . . 13 . +Dsim/Dyak -Dmel/Dere 10 23 -45 57 40 -99 . . . . . . . -Dere/Dsim +Dsec/Dyak 11 . . . 11 -15 . -83 100 . . . . +Dpse -Dper ...... itop <- ii[1:500,ip]; ig <- rownames(gspp.m.go4.coa$li[ itop,]) wt <- gspp.m.go4.coa$li[ itop,ip] ## want not ggo count as weight (fixed by gene-go relation) ## but coa$li weight and/or coa$co weight/species counts v <- apply(ggo[ig,],2,sum) ^^ returns gene-count/GOID for COA-factor itop gene set -- turn into GO-class + gene-counts per COA-factor ? ngo <- apply(ggo[ig,],2,sum) ngo <- sort( ngo[ngo>1], decreasing=TRUE) cbind( N=ngo, gocats[names(ngo),2:3]) #....... wtgo <- apply( wt * ggo[ig,],2,sum) # mangles names GO: to GO. # ^^ should this be mean over genes, rather than sum, of GO weighted by COA gene loading? wtgo <- wtgo[ order(abs(wtgo),decreasing=TRUE) ] names(wtgo) <- gsub("\\.",":", names(wtgo)) cbind( N=wtgo, gocats[names(wtgo),2:3]) ...... #?? list Nparalogs/species with Goterms ? goterm1genelist <- rownames( ggo[ ggo[,"GO:0019538"] > 0, ] ) gospp1gene <- apply( gspp.d.go[ goterm1genelist,flyorder], 2, sum, na.rm=TRUE) ... sptot <- apply( gspp.d.go[ ,flyorder], 2, sum, na.rm=TRUE) sppdf <- data.frame(matrix(nrow=0,ncol=12)); names(sppdf)<-flyorder for (i in names(wtgo)[1:20]) { iid <- rownames( ggo[ ggo[,i] > 0, ] ) spsum <- apply( gspp.d.go[ iid,flyorder], 2, sum, na.rm=TRUE) sppdf <- rbind( sppdf, spsum) } cbind( COA_li=wtgo, Ngenes=ngo, gocats[names(wtgo),2:3], sppdf)[1:20,] cbind( COA_wt=wtgo, N=ngo, round(sppdf - apply(sppdf,1,mean)), ## sppdf, Term=strtrim(as.character(gocats[names(wtgo),3]),15), Kind=gocats[names(wtgo),2] )[1:20,] DMEL gene COA. 3 dmel dsim dsec dyak dere dana dper dpse dwil dmoj dvir dgri C3 17 14 . . . -38 21 . -100 15 14 51 -Dwil +Dgri/vir/moj COA_wt dmel dsim dsec dyak dere dana dper dpse dwil dmoj dvir dgri Term Kind GO:0003824 -14.17 1257 1368 1329 1320 1233 1309 1421 1295 1372 1238 1234 1317 catalytic activ F GO:0006350 13.70 1066 1173 1084 1095 1015 1031 1167 984 1049 1047 1037 1104 transcription P GO:0005634 13.21 1466 1611 1535 1514 1414 1444 1639 1417 1480 1443 1434 1520 nucleus C GO:0009653 11.61 1279 1351 1282 1298 1187 1232 1422 1186 1244 1232 1211 1309 morphogenesis P GO:0007165 11.61 1478 1621 1532 1562 1431 1510 1640 1399 1482 1461 1428 1576 signal transduc P GO:0006629 -10.15 520 542 547 544 493 546 544 495 552 499 503 539 lipid metabolis P GO:0006118 -10.02 335 355 354 358 337 367 363 337 385 324 338 359 electron transp P GO:0030528 8.01 658 718 682 684 638 643 718 616 645 648 652 695 transcription r F GO:0003677 7.94 658 721 690 670 621 632 696 599 652 634 627 687 DNA binding F GO:0005886 7.84 669 709 662 660 608 627 731 595 624 623 604 672 plasma membrane C GO:0005515 7.62 856 900 859 863 796 830 930 802 822 818 814 863 protein binding F GO:0019538 -7.20 1347 1424 1367 1384 1282 1312 1406 1258 1344 1234 1211 1282 protein metabol P GO:0003676 6.92 714 783 766 737 694 702 795 702 716 698 711 729 nucleic acid bi F GO:0006139 6.67 768 802 789 773 735 761 811 725 738 733 733 759 P GO:0000166 6.57 991 1057 1006 992 931 959 1050 923 968 939 935 980 nucleotide bind F GO:0005198 6.57 845 885 867 880 796 815 934 809 849 810 806 859 structural mole F GO:0007010 6.23 513 547 529 506 477 484 575 480 510 489 484 526 cytoskeleton or P GO:0015031 5.90 565 603 582 568 535 568 626 569 577 539 560 588 protein transpo P GO:0005102 5.79 312 337 314 326 304 311 341 295 305 304 323 347 receptor bindin F GO:0008283 5.61 309 333 325 304 299 288 343 282 300 305 310 323 cell proliferat P strtrim(paste(gocats[1:10,3]),20) ## using all goids/gene, find wtgene,ngene per goid go1tab <- read.delim("gene-go1-modDM.tab") # site, goid names(wt) <- ig # genes golist <- levels(factor(go1tab$goid)) go1tab$goid <- as.character(go1tab$goid) go1tab$site <- as.character(go1tab$site) wtv <- gspp.d.go4.coa$li ## this takes forever, do in perl / write dudi.coa result list ngo1 <- data.frame( N=rep(0,length(golist)), matrix(0,nrow=length(golist),ncol=ncol(wtv)), # expand to Wt[2..df] row.names=golist); nc <- ncol(ngo1) ib <- 1; ie <- 2000 # iter takes long time for 70,000 gene-go rows for (i in ib:ie) { gi <- go1tab$goid[i] wt1 <- wtv[ go1tab$site[i], ] ; wt1[is.na(wt1)] <- 0 if(is.na(ngo1[gi,1])) ngo1[gi,]<- rep(0,nc) ngo1[gi,1] <- ngo1[gi,1] + 1 ngo1[gi,2:nc] <- ngo1[gi,2:nc] + wt1 ## expand ngo.wt[2..df] to all COA$li factors } #---- perl equivalent of above ------- # ~/Desktop/dspp-work/eugenomestats/readrgo3.pl #!/usr/bin/perl # readRgo.pl # cat ../gocat.tab gene-go1-modDM.tab gene4go-modDM-dmel-coa.out.data \ # | perl readgo.pl ## this should reproduce R result w/ main go classes: # cat ../gocat.tab gene-go-modDM.tab gene4go-modDM-dmel-coa.out.data ## output tweaks: add GOparent term(s); filter out C/location kind from 15 top.. # $dat="gene4go-modDM-dmel-coa.out.data"; # open(D,$dat); ## look for '$li' table of gene weights (row) x factor (col) my $dat={}; my (%gotab,%goids,%gocat,%gopar); while(<>){ chomp; if (/^GO:\d/) { # gocat table # GO:0008372 C cellular_component unknown # ^^ added GOParent column here (,sep id list) ($go,$gopar,$cterm)=split("\t",$_,3); $gocat{$go}= $cterm; $gopar{$go}= $gopar; } elsif(/\w\sGO:\d/){ # site/goid table ($gn,$go)=split; $gotab{$gn}=[] unless(ref $gotab{$gn}); $goids{$go}++; push(@{$gotab{$gn}},$go); ## R tables here... } elsif(/^\$(\S+)/){ $var=$1; my $newdat = { var => $var, col => {}, row => {}, mat => {}, }; $dat{$var}= $dat = $newdat; } elsif(/^([+-]?\d[\d\.e-]+)/) { # numeric row # -4.514108e-02 my @v=split; $rn="r1"; $dat->{row}->{$rn}++; if(ref $dat->{mat}->{$rn}) { push(@{$dat->{mat}->{$rn}}, @v); } else { $dat->{mat}->{$rn}= \@v; } } elsif(/^(\w\S+)/) { # row label + data? my @v=split; $rn= shift @v; $dat->{row}->{$rn}++; if(ref $dat->{mat}->{$rn}) { push(@{$dat->{mat}->{$rn}}, @v); } else { $dat->{mat}->{$rn}= \@v; } } elsif(/^\s*\[(\d+)\]/) { # row/vector label? $rn=$1; my @v=split; shift @v; $dat->{row}->{$rn}++; if(ref $dat->{mat}->{$rn}) { push(@{$dat->{mat}->{$rn}}, @v); } else { $dat->{mat}->{$rn}= \@v; } } elsif(/^\s+(\w\S+)/) { # column labels my @v=split; foreach $cn (@v) { $dat->{col}->{$cn}++; } } } print "parse COA genes factors to GO terms\n"; $nf = $dat{"nf"}->{mat}->{1}->[0]; print "nf=$nf\n"; $li = $dat{"li"}; die "no dat{li}" unless(ref $li); @col = sort keys %{$li->{col}}; @row = sort keys %{$li->{row}}; %mat = %{$li->{mat}}; # row hash only? ## ok here $nr= scalar(@row)-1; $nc= scalar(@col)-1; print "col[0..$nc]: ",join(",",@col[0..9]),"\n"; print "row[0..$nr]: ",join(",",@row[0..9]),"\n"; if(0) { my $topr= 5; print "----: "; foreach $c (0..$nc) { $v=$col[$c]; print "\t$v"; } print "\n"; foreach $r (0..$topr) { $rn= $row[$r]; print "$rn: "; my @v= @{$mat{$rn}}; foreach $c (0..$nc) { $v= $v[$c] || "NA"; print "\t$v"; } print "\n"; } print "----------\n\n"; } # gi <- go1tab$goid[i] # wt1 <- wtv[ go1tab$site[i], ] ; wt1[is.na(wt1)] <- 0 # if(is.na(ngo1[gi,1])) ngo1[gi,]<- rep(0,nc) # ngo1[gi,1] <- ngo1[gi,1] + 1 # ngo1[gi,2:nc] <- ngo1[gi,2:nc] + wt1 ## expand ngo.wt[2..df] to all COA$li factors $nr= scalar(@row)-1; $nc= scalar(@col)-1; %ngo=(); %wtgo=(); %gngo=(); for $i (0..$nr) { ## rows of gene ids $gn= $row[$i]; # or from $gotab[$i]->{goid} @goids= @{$gotab{$gn}}; next unless(@goids); @wt= @{$mat{$gn}}; foreach $go (@goids) { $ngo{$go}++; ## $gngo{$go}{$gn}++; next unless(@wt); $vt=0; for $j (0..$nc) { $v= $wt[$j]; $vt+=$v; $wtgo{$go}->[$j] += $v; } #? check for is.na($v) ?? $wtgo{$go}->[$nc+1] += $vt; # sum over all factors } } sub addwt { my($wtgo,$ngo,$goids,$wt)= @_; foreach my $go (@$goids) { $ngo->{$go}++; next unless(@$wt); my $vt=0; for my $j (0..$nc) { my $v= $wt[$j]; $vt+=$v; $wtgo->{$go}->[$j] += $v; } #? check for is.na($v) ?? $wtgo->{$go}->[$nc+1] += $vt; # sum over all factors } } $maxrow=20; # try lots? ## add short list of $gn ids with each GO id? @goids= keys %ngo; for $j (0..$nc+1) { print "COA factor. ",(($j>$nc)?"ALL":$j+1),"\n"; printf("%2s %10s %6s %6s %30s %4s %-39s\n"," ", qw(GO_Id COA_wt Ngenes Term Kind GO_Parent)); #GeneSample @topids = sort { abs($wtgo{$b}->[$j]) <=> abs($wtgo{$a}->[$j])} @goids; $nw= 0; for $i (0..$nr) { # stop at $maxrow $go= $topids[$i]; $wt= $wtgo{$go}->[$j]; $ng= $ngo{$go}; ## @gngo= sort keys %{$gngo{$go}}; $gngo= join(",",@gngo[0..4]); #?? ($cla,$term)= split("\t",$gocat{$go}); next if($cla eq "C"); # less interesting than F,P classes $gopar= $gopar{$go}; if($gopar) { #?? walk farther up gotree to near goslim levels? ($pid)= split(/,/,$gopar); (undef,$pterm)= split("\t", $gocat{ $pid }); $gopar.="/$pterm" if($pterm); } printf("%2d %10s %6.0f %6d %30s %4s %-39s\n",$i+1,$go,$wt,$ng,substr($term,0,30),$cla, substr($gopar,0,39));# ,$gngo $nw++; last if($nw>=$maxrow); } print "-"x78,"\n\n"; } #? look at sum weights across j factors? should be similar to mosaic/crosstab effects? #------------------------------------------------------ ## string values: #$call #dudi.coa(df = gspp.go, scannf = FALSE, nf = nspp - 1) #------------------------------------- ngo2 <- ngo1[(ngo1[,1] > 0),] # ip <- 2 igo2 <- order(abs(ngo2[,ip+1]),decreasing=TRUE) # ?? add cols of species gene counts/goterm (see above) dfgo2 <- data.frame( GO=gon1 <- rownames(ngo2[igo2,]), COA_wt=ngo2[igo2,ip+1], Ngenes=ngo2[igo2,1], Term=strtrim(as.character(gocats[gon1,3]),20), Kind=as.character(gocats[gon1,2]) ) cat(ansource,ip) dfgo2[1:20,] #......... ngo1 <- data.frame( N=rep(0,length(golist)), Wt=rep(0,length(golist)), # expand to Wt[2..df] row.names=golist); ni <- 20000 # nrow(go1tab) #for (i in 1:ni) ib <- 1001; ie <- 20000 ib <- 20001; ie <- 50000 for (i in ib:ie) { gi <- go1tab$goid[i] wt1 <- wt[ go1tab$site[i] ] ; wt1 <- ifelse(is.na(wt1),0,wt1) # wt1 <- wts[ go1tab$site[i], ] wt1[is.na(wt1)] <- 0 # ngo1[gi,...] <- ngo1[gi,...] + wt1 # vector add if(is.na(ngo1[gi,2])) ngo1[gi,]<- c(0,0) ngo1[gi,1] <- ngo1[gi,1] + wt1 ## expand ngo.wt[2..df] to all COA$li factors ngo1[gi,2] <- ngo1[gi,2] + 1 ## make col1 } ngo2 <- ngo1[(ngo1[,2] > 0),] ##ngo1 <- ngo2 igo1 <- order(abs(ngo2[,1]),decreasing=TRUE) # igo2 <- order(abs(ngo2[,1]/ngo2[,2]),decreasing=TRUE) # igo1 <- igo2 # gon1 <- rownames(ngo2[igo1,]) dfgo1 <- data.frame( COA_wt=ngo2[igo1,1], Ngenes=ngo2[igo1,2], GO=gon1 <- rownames(ngo2[igo1,]), Term=strtrim(as.character(gocats[gon1,3]),20), Kind=as.character(gocats[gon1,2]) ) cat(ansource,ip) dfgo1[1:20,] # gn <- as.character(go1tab[i,1]) # gi <- as.character(go1tab[i,2]) #..... # for (ip in 1:10) ansource <- "DMEL gene COA." ggo <- genego.d[,-1]; rownames(ggo) <- genego.d[,1] names(ggo) <- gsub("\\.",":", names(ggo)) ip <- 2 ig <- rownames(gspp.d.go4.coa$li) wt <- gspp.d.go4.coa$li[ ,ip] ngo <- apply( ggo[ig,],2,sum) wtgo <- apply( wt * ggo[ig,],2,sum) # mangles names GO: to GO. # wtngo <- wtgo/ngo #?? igo <- order(abs(wtgo),decreasing=TRUE) #wtgo <- wtgo[ igo ] #ngo <- ngo [ igo ] names(wtgo) <- gsub("\\.",":", names(wtgo)) cat(ansource,ip) cbind( COA_wt=wtgo[igo], Ngenes=ngo[igo], gocats[names(wtgo[igo]),2:3])[1:20,] DMEL gene COA. 1 COA_li Ngenes GOclass GOterm GO:0019538 84.45997 1157 P protein metabolism GO:0008233 64.22630 646 F peptidase activity GO:0009607 27.31463 494 P response to biotic stimulus GO:0005623 25.12691 1268 C cell GO:0005634 22.30829 1266 C nucleus GO:0005576 21.23335 311 C extracellular region GO:0030234 19.82908 341 F enzyme regulator activity GO:0006810 19.61091 782 P transport GO:0003824 19.34089 1124 F catalytic activity GO:0006350 19.18683 902 P transcription GO:0007582 18.80382 474 P physiological process GO:0000166 18.79317 825 F nucleotide binding GO:0006139 18.50991 665 P GO:0016787 18.49429 863 F hydrolase activity GO:0005488 18.12051 926 F binding GO:0003677 18.08657 539 F DNA binding GO:0009628 18.02500 253 P response to abiotic stimulus GO:0007165 17.88382 1238 P signal transduction GO:0006629 17.35503 458 P lipid metabolism GO:0016740 17.08391 535 F transferase activity DMEL gene COA. 2 > cbind( COA_li=wtgo, Ngenes=ngo, gocats[names(wtgo),2:3])[1:20,] COA_li Ngenes GOclass GOterm GO:0007165 14.213213 1238 P signal transduction GO:0006464 -10.740766 782 P protein modification GO:0006350 10.049239 902 P transcription GO:0005623 8.666412 1268 C cell GO:0003677 8.414568 539 F DNA binding GO:0003700 7.912096 381 F transcription factor activity GO:0030528 7.512992 560 F transcription regulator activity GO:0005488 -7.124703 926 F binding GO:0005622 -6.955147 284 C intracellular GO:0006810 6.757157 782 P transport GO:0003723 6.502301 349 F RNA binding GO:0008152 6.252770 464 P metabolism GO:0000166 6.132173 825 F nucleotide binding GO:0006629 5.618220 458 P lipid metabolism GO:0005102 5.371902 263 F receptor binding GO:0016740 5.311659 535 F transferase activity GO:0005975 4.999801 343 P carbohydrate metabolism GO:0007267 4.848512 429 P cell-cell signaling GO:0005216 -4.530751 160 F ion channel activity GO:0003824 -4.524383 1124 F catalytic activity DMEL gene COA. 3 > cbind( COA_li=wtgo, Ngenes=ngo, gocats[names(wtgo),2:3])[1:20,] COA_li Ngenes GOclass GOterm GO:0003824 -14.171912 1124 F catalytic activity GO:0006350 13.700137 902 P transcription GO:0005634 13.206056 1266 C nucleus GO:0009653 11.606526 992 P morphogenesis GO:0007165 11.605494 1238 P signal transduction GO:0006629 -10.145985 458 P lipid metabolism GO:0006118 -10.020528 316 P electron transport GO:0030528 8.007120 560 F transcription regulator activity GO:0003677 7.937698 539 F DNA binding GO:0005886 7.842280 505 C plasma membrane GO:0005515 7.623749 679 F protein binding GO:0019538 -7.197577 1157 P protein metabolism GO:0003676 6.917542 637 F nucleic acid binding GO:0006139 6.668585 665 P GO:0000166 6.574168 825 F nucleotide binding GO:0005198 6.573758 706 F structural molecule activity GO:0007010 6.226521 402 P cytoskeleton organization and biogenesis GO:0015031 5.898851 493 P protein transport GO:0005102 5.791552 263 F receptor binding GO:0008283 5.608207 258 P cell proliferation #---------------------------------------------------------------- ansource <- "MOUSE gene COA." ip <- 3 ig <- rownames(gspp.m.go4.coa$li) wt <- gspp.m.go4.coa$li[ ,ip] wtgo <- apply( wt * ggo[ig,],2,sum) # mangles names GO: to GO. ngo <- apply( ggo[ig,],2,sum) igo <- order(abs(wtgo),decreasing=TRUE) wtgo <- wtgo[ igo ] ngo <- ngo [ igo ] names(wtgo) <- gsub("\\.",":", names(wtgo)) cat(ansource,ip) cbind( COA_li=wtgo, Ngenes=ngo, gocats[names(wtgo),2:3])[1:20,] #------------------------------ # MOUSE gene COA (neg of DMEL COA; e.g. mouse genes high where dmel genes low?) MOUSE gene COA# 1 > cbind( COA_li=wtgo, Ngenes=ngo, gocats[names(wtgo),2:3])[1:20,] COA_li Ngenes GOclass GOterm GO:0005634 44.55835 2222 C nucleus GO:0005515 34.83762 2073 F protein binding GO:0006464 32.12167 886 P protein modification GO:0005488 29.55808 1876 F binding GO:0006350 29.31094 1273 P transcription GO:0007165 24.64337 1249 P signal transduction GO:0005623 24.26854 2951 C cell GO:0003676 22.79115 593 F nucleic acid binding GO:0005615 20.05129 1114 C extracellular space GO:0007275 19.05015 531 P development GO:0003677 17.87740 1195 F DNA binding GO:0007610 -17.14332 179 P behavior GO:0005198 16.79629 344 F structural molecule activity GO:0004872 16.73791 802 F receptor activity GO:0016740 16.73517 1047 F transferase activity GO:0003700 16.62836 572 F transcription factor activity GO:0005576 16.15551 133 C extracellular region GO:0005102 14.49992 158 F receptor binding GO:0003824 13.98142 1206 F catalytic activity GO:0005622 13.19633 707 C intracellular MOUSE gene COA# 2 COA_li Ngenes GOclass GOterm GO:0005623 -50.61427 2951 C cell GO:0005634 -37.23654 2222 C nucleus GO:0004872 -33.43963 802 F receptor activity GO:0009653 -30.60927 838 P morphogenesis GO:0005886 -29.82681 781 C plasma membrane GO:0005515 -29.63670 2073 F protein binding GO:0007165 -29.44257 1249 P signal transduction GO:0007275 -28.83126 531 P development GO:0005615 -26.79817 1114 C extracellular space GO:0005488 -26.77772 1876 F binding GO:0019538 -26.18432 774 P protein metabolism GO:0006350 -25.46829 1273 P transcription GO:0000166 -25.31183 1294 F nucleotide binding GO:0003677 -23.67286 1195 F DNA binding GO:0030154 -22.87822 328 P cell differentiation GO:0008233 -22.84640 486 F peptidase activity GO:0005654 -22.09996 388 C nucleoplasm GO:0030528 -18.87041 351 F transcription regulator activity GO:0005578 -17.34173 225 C extracellular matrix (sensu Metazoa) GO:0006464 -16.23301 886 P protein modification dmel dsim dsec dyak dere dana dper dpse dwil dmoj dvir dgri C2 15 19 14 . . -15 100 89 -40 -58 -76 -64 +Dpse/Dper -Dgri/Dvir/Dmoj MOUSE gene COA# 3 > cbind( COA_li=wtgo, Ngenes=ngo, gocats[names(wtgo),2:3])[1:20,] COA_li Ngenes GOclass GOterm GO:0005615 49.63980 1114 C extracellular space GO:0019538 37.28459 774 P protein metabolism GO:0005515 -30.40394 2073 F protein binding GO:0003824 27.51058 1206 F catalytic activity GO:0006350 -27.32144 1273 P transcription GO:0005739 26.75497 604 C mitochondrion GO:0005634 -26.24967 2222 C nucleus GO:0008233 22.35691 486 F peptidase activity GO:0005488 21.86088 1876 F binding GO:0003677 -18.30175 1195 F DNA binding GO:0006464 17.05164 886 P protein modification GO:0007154 -16.50187 348 P cell communication GO:0016787 15.13648 1175 F hydrolase activity GO:0007275 -14.04800 531 P development GO:0003700 -14.04539 572 F transcription factor activity GO:0005783 13.79713 379 C endoplasmic reticulum GO:0008152 -13.15446 418 P metabolism GO:0009607 11.74922 273 P response to biotic stimulus dmel dsim dsec dyak dere dana dper dpse dwil dmoj dvir dgri C3 . -12 . . . 39 . . 99 -33 -42 -22 +Dwil -Dvir/Dmoj MOUSE gene COA. 7 == Dvir vs Dmoj > cbind( COA_li=wtgo, Ngenes=ngo, gocats[names(wtgo),2:3])[1:20,] COA_li Ngenes GOclass GOterm GO:0003700 17.534932 572 F transcription factor activity GO:0003676 17.420877 593 F nucleic acid binding GO:0006810 16.592611 1179 P transport GO:0004871 -14.711267 378 F signal transducer activity GO:0005886 12.445884 781 C plasma membrane GO:0005856 11.809234 480 C cytoskeleton GO:0007010 11.241017 247 P cytoskeleton organization and biogenesis GO:0005634 11.215868 2222 C nucleus GO:0003824 10.925800 1206 F catalytic activity GO:0006996 10.728751 94 P organelle organization and biogenesis GO:0004872 -9.520720 802 F receptor activity GO:0004672 9.323379 429 F protein kinase activity GO:0030246 -9.307307 145 F carbohydrate binding GO:0009605 -8.986746 196 P response to external stimulus GO:0009607 -8.929930 273 P response to biotic stimulus GO:0006811 8.611295 455 P ion transport GO:0005623 -8.239872 2951 C cell GO:0005515 7.896609 2073 F protein binding GO:0015031 7.402791 420 P protein transport GO:0016740 -7.258702 1047 F transferase activity #.......... # test total weight of each gene on species factors # see reconst to build predict. coa data, compare to true / margins with mean(sqr( dev)) > (gspp.d.go[1:5,1:5]) dana dere dgri dmel dmoj CG1004 1 1 2 1 1 CG1007 1 1 1 0 1 CG1009 1 1 1 1 1 CG1014 1 1 0 1 0 CG1017 1 1 1 1 1 > gspp.mod0 <- reconst(gspp.coa,0) > (gspp.mod0[1:5,1:5]) dana dere dgri dmel dmoj CG1004 1.0748689 1.0315953 1.0979482 1.0690991 1.0321963 CG1007 0.9095045 0.8728883 0.9290331 0.9046223 0.8733969 CG1009 0.9921867 0.9522418 1.0134906 0.9868607 0.9527966 CG1014 0.7441400 0.7141814 0.7601180 0.7401455 0.7145975 CG1017 0.9921867 0.9522418 1.0134906 0.9868607 0.9527966 > gspp.mod1 <- reconst(gspp.coa,1) > (gspp.mod1[1:5,1:5]) dana dere dgri dmel dmoj CG1004 1.0730038 0.9593706 1.2348683 0.9198304 1.1443674 CG1007 0.9074395 0.7929225 1.0806284 0.7393550 0.9975905 CG1009 0.9919929 0.9447384 1.0277152 0.9713533 0.9644500 CG1014 0.7485005 0.8830376 0.4400085 1.0891250 0.4523494 CG1017 0.9919929 0.9447384 1.0277152 0.9713533 0.9644500 > mean((gspp.d.go-gspp.mod0)^2) [1] 0.2232531 > gspp.mod2 <- reconst(gspp.coa,2) > mean((gspp.d.go-gspp.mod2)^2) [1] 0.1428990 > gspp.mod11 <- reconst(gspp.coa,11) > mean((gspp.d.go-gspp.mod11)^2) [1] 1.106581e-26 > gspp.mod11 <- reconst(gspp.coa,9) > mean((gspp.d.go-gspp.mod11)^2) [1] 0.0195274 > gspp.mod11 <- reconst(gspp.coa,7) > mean((gspp.d.go-gspp.mod11)^2) [1] 0.0466109 > iiv <- order(abs(v<-(gspp.mod11 - gspp.mod0))[,"dgri"], decreasing=TRUE) > v[iiv[1:5],] dana dere dgri dmel dmoj dper dpse dsec CG18241 0.8018282 0.01918653 -2.670738 -0.8756168 -1.458545 2.0578381 1.5715318 0.1089261 CG9468 0.5258074 -0.03542985 -2.668093 -0.7250629 -1.395601 1.7260578 1.2831718 0.2936293 CG15096 -0.6826146 -0.56452421 2.533425 -0.8354241 1.540310 0.1580748 0.0607774 -0.9479936 CG12754 -0.6317056 -0.59371649 2.407684 -0.7945739 1.621137 -0.3775575 -0.3040762 -0.9324897 CG12275 -0.6391441 -0.46197251 2.350590 -0.6197888 1.424646 -0.1501115 -0.1574792 -0.7719592 dsim dvir dwil dyak CG18241 0.2012255 -1.275466 1.3345475 0.1852828 CG9468 0.5768021 -1.216098 1.7446240 -0.1098075 CG15096 -0.9016715 1.426835 -0.7649832 -1.0222104 CG12754 -0.8633393 1.495868 0.1455796 -1.1728089 CG12275 -0.7270428 1.308005 -0.6962645 -0.8594790 > iiv <- order(abs(v<-(gspp.mod11 - gspp.mod0))[,"dmoj"], decreasing=TRUE) > v[iiv[1:10],] dana dere dgri dmel dmoj dper dpse dsec CG12754 -0.6317056 -0.59371649 2.407684 -0.7945739 1.621137 -0.3775575 -0.3040762 -0.9324897 CG13095 -0.2638907 -0.47129014 2.323479 -0.4687003 1.594982 -0.7543515 -0.5105150 -1.0744594 CG15096 -0.6826146 -0.56452421 2.533425 -0.8354241 1.540310 0.1580748 0.0607774 -0.9479936 CG10045 -0.3919273 -0.62630571 1.767514 -0.7248291 1.526252 -1.0345217 -0.7262902 -0.9100291 CG9797 -0.2651857 -0.35990674 2.171670 -0.4515411 1.462318 -0.9403639 -0.6443804 -0.8288902 CG18241 0.8018282 0.01918653 -2.670738 -0.8756168 -1.458545 2.0578381 1.5715318 0.1089261 CG18420 -0.6199657 0.72170232 -1.660216 1.5629136 -1.449712 -0.5780490 -0.6192618 1.9997975 CG15408 -0.5169264 -0.62381106 1.908893 -0.7628568 1.445253 -0.4034082 -0.3090014 -0.9166715 CG30090 0.7925754 0.83325544 -1.728808 -0.5480940 -1.434976 0.3212423 0.3575376 1.2070492 CG12275 -0.6391441 -0.46197251 2.350590 -0.6197888 1.424646 -0.1501115 -0.1574792 -0.7719592 dsim dvir dwil dyak CG12754 -0.8633393 1.495868 0.1455796 -1.1728089 CG13095 -1.3319685 1.466617 0.3078947 -0.8177981 CG15096 -0.9016715 1.426835 -0.7649832 -1.0222104 CG10045 -0.8493244 1.413712 1.9266518 -1.3709019 CG9797 -0.9909846 1.321628 0.1516195 -0.6259822 CG18241 0.2012255 -1.275466 1.3345475 0.1852828 CG18420 2.6609332 -1.389425 -1.5382715 0.9095535 CG15408 -0.8424249 1.350330 0.9656793 -1.2950549 CG30090 1.4566958 -1.449780 -1.7424169 1.9357189 CG12275 -0.7270428 1.308005 -0.6962645 -0.8594790 > > iiv <- order(abs(v<-(gspp.mod11 - gspp.mod0))[,"dpse"], decreasing=TRUE) > v[iiv[1:10],] dana dere dgri dmel dmoj dper dpse dsec CG10862 -0.09062487 -0.3733037 -1.070510318 -0.3490475 -0.7368821 2.991891 2.057933 -0.4525263 CG32581 0.04896662 -0.3601356 -0.740303631 -0.6026571 -0.5498834 2.911092 2.044158 -0.6108976 CG6577 -0.07702156 -0.3654658 -1.040842398 -0.4014706 -0.7182538 2.952082 2.035427 -0.4468901 CG32401 0.33929120 -0.2425127 -1.626077684 -0.4549258 -0.9824497 2.752635 1.968122 -0.4050283 CG5245 -0.05889529 -0.3726506 -0.007804664 -0.6189013 -0.2023103 2.796808 1.957065 -0.7629172 CG32403 -0.48557649 -0.4325714 -0.459535137 -0.0853773 -0.4738292 2.939734 1.946122 -0.4356044 CG2025 -0.18817708 -0.4167753 -0.581990864 -0.4918680 -0.4569568 2.836301 1.945619 -0.5453772 CG2061 -0.18817708 -0.4167753 -0.581990864 -0.4918680 -0.4569568 2.836301 1.945619 -0.5453772 CG6756 -0.18817708 -0.4167753 -0.581990864 -0.4918680 -0.4569568 2.836301 1.945619 -0.5453772 CG8553 -0.18817708 -0.4167753 -0.581990864 -0.4918680 -0.4569568 2.836301 1.945619 -0.5453772 dsim dvir dwil dyak CG10862 -0.4635085 -0.52723601 -0.3506198 -0.6355649 CG32581 -0.7074516 -0.37508800 -0.5709690 -0.4868304 CG6577 -0.4486079 -0.51568737 -0.3621930 -0.6110758 CG32401 -0.5232962 -0.77160502 0.2454873 -0.2996397 CG5245 -0.9338821 -0.07015685 -1.2938623 -0.4324920 CG32403 -0.3832414 -0.27971759 -1.0192209 -0.8311828 CG2025 -0.5370244 -0.28347761 -0.5812765 -0.6989971 CG2061 -0.5370244 -0.28347761 -0.5812765 -0.6989971 CG6756 -0.5370244 -0.28347761 -0.5812765 -0.6989971 CG8553 -0.5370244 -0.28347761 -0.5812765 -0.6989971 > iiv <- order(abs(v<-(gspp.mod11 - gspp.mod0))[,"dmel"], decreasing=TRUE) > v[iiv[1:10],] dana dere dgri dmel dmoj dper dpse CG33244 -0.8372117 0.05241463 -0.4559476 3.374777 -0.4694572 -0.31569298 -0.5226995 CG32823 -1.2447269 0.02493205 -0.7468843 3.159532 -0.7113002 0.07328605 -0.3273040 CG3379 0.7118335 -0.22034213 1.4273676 -3.082777 1.2509944 -0.83417667 -0.2919906 CG31611 -0.9042420 -0.18481557 -1.4257197 2.970812 -0.9828735 1.17794225 0.5031370 CG1507 -0.4469163 0.07723554 -0.1526620 2.784280 -0.2059508 -0.61921633 -0.6298071 CG10327 -0.4469163 0.07723554 -0.1526620 2.784280 -0.2059508 -0.61921633 -0.6298071 CG6835 0.2737325 0.13428558 -0.9293137 2.771071 -0.5450715 -0.32801351 -0.3022893 CG1815 -0.3446410 0.09277287 -0.3023791 2.681678 -0.3273289 -0.30763206 -0.3927008 CG6103 -0.5300869 0.06962760 -0.3449687 2.679274 -0.3582849 -0.29884509 -0.4225938 CG6134 -0.5300869 0.06962760 -0.3449687 2.679274 -0.3582849 -0.29884509 -0.4225938 dsec dsim dvir dwil dyak CG33244 0.34548702 -0.097533663 -0.27896996 -0.3462687 -0.44889730 CG32823 0.77362131 0.774377153 -0.50301375 -0.5902955 -0.68222410 CG3379 -0.72553882 -0.460585027 0.99409429 1.2006496 0.03047107 CG31611 0.25988870 0.007188712 -0.67401600 0.2253517 -0.97265338 CG1507 0.08299799 -0.488699902 -0.07715558 -0.1262626 -0.19784326 CG10327 0.08299799 -0.488699902 -0.07715558 -0.1262626 -0.19784326 CG6835 -0.26746623 -1.262527372 -0.36277173 0.6865425 0.13182163 CG1815 0.04271690 -0.568415480 -0.18342212 -0.2912892 -0.09935888 CG6103 0.19312828 -0.260047015 -0.21161972 -0.2766929 -0.23889063 CG6134 0.19312828 -0.260047015 -0.21161972 -0.2766929 -0.23889063