# goblfunc.R # url.hspmap <- "/species/cgi-bin/gbrowse/" url.hspmap <- "http://melon.bio.indiana.edu/cgi-bin/gbrowse/" url.GOlookup <- "http://eugenes.org/fbservlet/goreport/id=" # gocats <- NULL idurls <- list( fruitfly="/flybase/cgi-bin/fbidq.html?", mouse="/mgi/idlookup/", worm="/wormbase/idlookup/", yeast="/sgd/idlookup/", ) ## need rewriting here: search?q=fban-SYM:(a+b+c) mgi?id=a,b,c worm?xxx=a+b+c ? listurls <- list( fruitfly="/flybase/lucegene/search?q=fban-SYM:", mouse="/mgi/idlist/", worm="/wormbase/idlist/", yeast="/sgd/idlist/", ) # http://www.informatics.jax.org/searches/accession_report.cgi?id=xxxx # where xxxx is a comma-separated list of Accession IDs. hreflink <- function(url,label) { paste("", label,"",sep="") } read.gocat <- function( file="../gocatall.tab" # goid,parent,class,term ) { goc <- read.delim(file,colClasses="character") rownames(goc) <- goc[,1] goc } godetails <- function( id, parent = TRUE, grandparent = TRUE ) { #??BAD??# if(!exists("gocats") || is.null(gocats)) gocats <- read.gocat() if(is.null(gocats)) return () gor <- gocats[id,] godet <- list(goid=gor$goid,class=gor$class,term=gor$term) if(parent || grandparent) { #par <- unlist(strsplit( gor$parent, ","))[1] #? walk up tree, only 1st parent? if(parent) godet <- c(godet,parent=gor$parent) if(grandparent) { pars <- c() prow <- gor pid <- gor$goid while(!is.na(pid)) { pars <- c(pars,pid) pid <- unlist(strsplit( prow$parent, ","))[1] if(is.na(pid)) break else prow <- gocats[pid,] } if(length(pars)>2) godet <- c(godet, grandparent=gocats[ pars[length(pars)-1], c("goid","term")]) ##else godet <- c(godet,parent=gor$parent) } } return (godet) } # remove ngenes outliers ?? winds.mean <- function(x, k=3){ y <- x[!is.na(x)] mu <- mean(y) stdev <- sd(y) outliers.up <- y[y>mu+k*stdev] outliers.lo <- y[y Drosophila species Gene Ontology matches ", "

",title,"

", "Legend: Blue bars: Mean gene count +/- 90% confidence interval
Red circle: expected value (population mean)

" ) } itemrefs <- function(gidlist) { hrefs <- c("
", "") nr <- 0 gord <- c() for (gi in gidlist) { gilab <- gi # paste(gid,v) if(exists("gocats")) gilab <- paste( as.character(gocats[gocats[,1] == gi,3]),collapse=" ") gord <- rbind(gord, c(gilab=gilab, gi=gi)) } for (i in order(gord[,"gilab"])) { gi <- gord[ i, "gi"] gilab <- gord[ i, "gilab"] hrefs <- c(hrefs, paste(" ",sep="") ) nr <- nr+1 if(nr %% 5 == 0) hrefs <- c(hrefs,"") } hrefs <- c(hrefs,"
",gilab,"
","
") hrefs } bestgenes <- function (gid, v, na.rm = FALSE) { xobs <- gop1[ gop1$GOID == gid & !is.na(gop1$GOID), c("species","query",v)] if(nrow(xobs)==0) return (list(ids=c(), qdev=c(), xobs=c(), dbs=c()) ) # qs <- levels(factor(xobs$query)) # ## patch for missing data (should be there...) for v = ngenes # ## for (qs) { for (species) { if xobs[spp,q] is missing, xobs.v <- 0 } } # for (q in qs) { for (s in levels(gop1$species)) { # if(sum(xobs$query == q & xobs$species == s)) xobs <- rbind(xobs, c(s,q,0)) # } } qs <- levels(factor(xobs$query)) xobs <- xobs[order(xobs$species),] gsmean <- aggregate(gop1[gop1$GOID == gid,v], list(Species = gop1[gop1$GOID == gid,"species"]), FUN=mean, na.rm = TRUE) gidmean <- gidmeans[gidmeans$Goid == gid,v] expect <- sppmeans[,v] + (gidmean - mean(sppmeans[,v])) expdif <- gsmean$x - expect # qdev <- data.frame( data=matrix(nrow=length(qs),ncol=4, ), row.names=qs) nr <- length(qs) qdev <- data.frame( mindev=rep(0,nr), obs=rep(0,nr), gsdev=rep(0,nr),expdev=rep(0,nr), row.names=qs) for (q in qs) { xo <- xobs[ xobs$query == q,v] ## want to minimize sum((xo - gsmean$x)^2) < sum((xo - expect)^2) xogs <- sum((xo - gsmean$x)^2) xoexp <- sum((xo - expect)^2) # gsexp <- sum( ((xo - gidmean) - (gsmean$x - expect))^2) # gsexp <- xogs * (xogs / xoexp)^2 gsexp <- xogs / xoexp ## this is it ** qdev[q,] <- list( mindev=gsexp, obs=sum(xo), gsdev=xogs, expdev=xoexp) } ## err here ? qdev? ##names(qdev)<-c("mindev","obs","gsdev","expdev") qd <- qdev[order(qdev$mindev),] ids <- row.names( qd) list(ids=ids, qdev=qd, xobs=xobs, dbs=levels(factor(gop1$DB)) ) } genetable <- function (gid, v, qdevbest=NULL, na.rm = FALSE) { ## ? check bids? may be many proteomes idurl <- idurls[[proteome]] #cgidurl listurl <- listurls[[proteome]] #cglisturl tabc <- "" #"\t" if(!is.null(qdevbest)){ bids <- qdevbest$ids xobs <- qdevbest$xobs qdev <- qdevbest$qdev dbs <- qdevbest$dbs ng <- length(bids) nq <- 0 for (q in bids) { #if(nq > 10 || qdev[q,"mindev"]>0.95) break if(nq > 10 || qdev[q,"mindev"]>1.0) break nq <- nq+1 } cat(gid, "Significant ",nq," of ",ng," total genes, contributing to ",v," deviation ","
\n") # add url to list of all gene reports ... bq <- paste(bids[1:nq],sep="+",collapse="+") bq <- gsub("-P.","",bq) if (!is.null(listurl)) cat( hreflink(paste(listurl,bq,sep=""), bq), "
\n") cat(tablehead()) nq <- 0 for (q in bids) { #if(nq > 10 || qdev[q,"mindev"]>0.95) break if(nq > 10 || qdev[q,"mindev"]>1.0) break qset <- (xobs$query == q) bq <- gsub("-P.","",q) # only for fly CG id gbq <- bq if(nq==0) cat( "","Query",tabc,paste( xobs[qset,"species"],collapse=tabc),"","\n",sep="" ) cat("") aidurl <- idurl if(charmatch("WBGene",q,nomatch = 0)==1) aidurl <- idurls["worm"] else { if(charmatch("MGI:",q,nomatch = 0)==1) { aidurl <- idurls["mouse"]; gbq <- sub("MGI:","",bq) # wildcard search in gbrowse with 'mgi:' fails, only does exact find } else { if(charmatch("S0",q,nomatch = 0)==1) aidurl <- idurls["yeast"] else { if(charmatch("CG",q,nomatch = 0)==1) aidurl <- idurls["fruitfly"] } } } cat( hreflink(paste(aidurl,bq,sep=""), bq)) # cat( tabc,paste(xobs[qset,v],collapse=tabc),"\n",sep="" ) ## add map link to numbers **** # $1 hvals <- paste( '', xobs[qset,v],"",sep="") cat( tabc,paste(hvals,collapse=tabc),"\n",sep="" ) cat("") nq <- nq + 1 } cat(tablefoot()) } } tablehead <- function(title=NULL,preface="") { tt <- "" if(!is.null(title)) { tshort <- sub(" *[({;\.].*","",title) tt <- paste("\n", "", tshort, "\n", "\n", "

",title,"

", preface) } paste(tt,"\n") } tablefoot <- function(footnote=NULL) { if(is.null(footnote)) { paste("
\n") } else { paste("\n",footnote,"\n\n") } } plotgenes <- function(gid, v, qdevbest=NULL, noplot = FALSE, na.rm = FALSE) { gsmean <- aggregate(gop1[gop1$GOID == gid,v], list(Species = gop1[gop1$GOID == gid,"species"]), FUN=mean, na.rm = TRUE) #gsciw <- aggregate(gop1[gop1$GOID == gid,v], list(Species = gop1[gop1$GOID == gid,"species"]), # FUN=cim, ci=0.95, na.rm = TRUE) expect <- sppmeans[,v] + (gidmeans[gidmeans$Goid == gid,v] - mean(sppmeans[,v])) ispoisson <- FALSE ## (v == "ngenes"|v == "nparalog") if(ispoisson) { gsli <- aggregate(gop1[gop1$GOID == gid,v], list(Species = gop1[gop1$GOID == gid,"species"]), FUN=cipoisson, ci=0.95, tlower=TRUE, na.rm = TRUE) gsui <- aggregate(gop1[gop1$GOID == gid,v], list(Species = gop1[gop1$GOID == gid,"species"]), FUN=cipoisson, ci=0.95, tlower=FALSE, na.rm = TRUE) } else { gsli <- aggregate(gop1[gop1$GOID == gid,v], list(Species = gop1[gop1$GOID == gid,"species"]), FUN=cim, ci=0.95, na.rm = TRUE) gsui <- gsli } if(noplot != TRUE) { par(mar=c(4,4,1,1)) #plotCI( x=gsmean[,2], uiw=gsciw[,2], ylab=paste(v,gid,sep="-"), xlab="Species", gap=FALSE, col="black", barcol="blue", xaxt="n") plotCI( x=gsmean[,2], uiw=gsui[,2], liw=gsli[,2], ylab=v, xlab="Species", gap=FALSE, col="black", barcol="blue", xaxt="n") points(expect,pch=23,col="red") axis(side=1, at=1:nrow(sppmeans), labels=as.character(sppmeans$Species), cex.axis=1.5) plab <- gid # paste(gid,v) if(exists("gocats")){ plab <-paste( gid, as.character(gocats[gocats[,1] == gid,3]),collapse=" ") } mtext( plab, side=3, line = -2 ) } diff <- 1.1 * abs(gsmean[,2] - expect) # sigdif <- max( 1.1 * diff - gsui[,2]) sigdif <- max( diff - gsli[,2], diff - gsli[,2]) sigdif } count <- function (x, na.rm = FALSE) { if (is.matrix(x)) apply(x, 2, count, na.rm = na.rm) else if (is.vector(x)) ifelse(na.rm, sum(!is.na(x)), length(x)) else if (is.data.frame(x)) sapply(x, count, na.rm = na.rm) else count(as.vector(x), na.rm = na.rm) } cim <- function (x, ci = 0.975, na.rm = FALSE) { if (is.matrix(x)) apply(x, 2, cim, ci = ci, na.rm = na.rm) else if (is.vector(x)) { nd <- ifelse(na.rm, sum(!is.na(x)), length(x)) qt(ci, nd) * sqrt(var(x, na.rm = na.rm) / nd) } else if (is.data.frame(x)) sapply(x, cim, ci = ci, na.rm = na.rm) else sapply(as.vector(x), cim, ci = ci, na.rm = na.rm) } cilog <- function (x, ci = 0.975, na.rm = FALSE) { if (is.matrix(x)) apply(x, 2, cilog, ci = ci, na.rm = na.rm) else if (is.vector(x)) { nd <- ifelse(na.rm, sum(!is.na(x)), length(x)) qt(ci, nd) * sqrt(var(log(x), na.rm = na.rm) / nd) } else if (is.data.frame(x)) sapply(x, cilog, ci = ci, na.rm = na.rm) else sapply(as.vector(x), cilog, ci = ci, na.rm = na.rm) } meanlog <- function (x, ci = 0.975, na.rm = FALSE) { if (is.matrix(x)) apply(x, 2, meanlog, ci = ci, na.rm = na.rm) else if (is.vector(x)) { mean(log(x)) } else if (is.data.frame(x)) sapply(x, meanlog, ci = ci, na.rm = na.rm) else sapply(as.vector(x), meanlog, ci = ci, na.rm = na.rm) } #see cim: conf.intervals for mean of poisson distr., e.g. gene counts cipoisson <- function (x, ci = 0.975, tlower=TRUE, na.rm = FALSE) { if (is.matrix(x)) apply(x, 2, cipoisson, ci = ci, na.rm = na.rm) else if (is.vector(x)) { nd <- ifelse(na.rm, sum(!is.na(x)), length(x)) mn <- mean(x, na.rm = na.rm) #? which is it? # psd <- sqrt(mn) psd <- sqrt(mn/nd) qt(ci, nd) * ppois(psd, mn, lower = tlower) # psem <- sqrt(mn/nd) } else if (is.data.frame(x)) sapply(x, cipoisson, ci = ci, na.rm = na.rm) else sapply(as.vector(x), cipoisson, ci = ci, na.rm = na.rm) } sem <- function (x, na.rm = FALSE) { if (is.matrix(x)) apply(x, 2, sem, na.rm = na.rm) else if (is.vector(x)) { nd <- ifelse(na.rm, sum(!is.na(x)), length(x)) sqrt(var(x, na.rm = na.rm) / nd) } else if (is.data.frame(x)) sapply(x, sem, na.rm = na.rm) else sapply(as.vector(x), sem, na.rm = na.rm) } # =========================================