.packageName <- "AnnBuilder"
# A script to build annotation data packages.
# pkgName - a character string for the name of the data package to be
#           built (e. g. hgu95a, rgu34a)
# organism - a character string for the name of the organism of
#            concern (now can only be "human", "mouse", or "rat")
# pkgPath - a character string for the full path of an existing
#           directory where the built backage will be stored
# version - a character string for the version number
# baseName - a character string for the name of a file to be used as a
#            base file to base source data.
# baseMapType - a character string that is either "gb" or "ug" to
#             indicate whether the probe ids in baseName are mapped to
#             GenBack accession numbers or UniGene ids.
# otherSrc - a vector of named character strings for the names of files that
#          contain mappings between probe ids of baseName and
#          LobusLink ids that will be used to obtain the unified
#          mappings between probe ids of baseName and LocusLink ids
#          based on all the sources. The strings should not contain
#          any number.
# makeXML - a boolean to indicate whether an XML version will also be
#         generated.
# srcUrls - a vector of names character strings for the urls where
#         source data files will be retained. Valid sources are LocusLink,
#         UniGene, Golden Path, Gene Ontology, and KEGG. The names for the
#         character strings should be LL, UG, GP, GO, and KEGG,
#         respectively. LL and UG are required.
# author - a named vector of character string with a name element for
#          the name of the author and address element for the email
#          address of the author.
#
# This function remains to be long to avoid passing large data files
# around considering the size of the data files being processed.
#
# Copyright 2003, J. Zhang, all rights reserved.
#

ABPkgBuilder <- function(baseName, srcUrls, baseMapType = c("gb",
                         "ug", "ll", "image"),
                         otherSrc = NULL, pkgName, pkgPath,
                         organism = c("human", "mouse", "rat"),
                         version = "1.1.0", makeXML = TRUE,
                         author = list(author = "who",
                         maintainer = "who@email.com"), fromWeb = TRUE){
    baseMapType <- match.arg(baseMapType)
    switch(baseMapType,
           gb = baseParser <- getBaseParsers("gb"),
           ug = baseParser <- getBaseParsers("ug"),
           image = baseParser <- getBaseParsers("image"),
           ll = baseParser <- getBaseParsers("ll"),
           stop("Invalid imput for baseMapType"))
    organism <- match.arg(organism)
    switch(organism,
           human =,
           mouse =,
           rat = TRUE,
           stop("Invalid input for organism"))
    print("It may take me a while to process the data. Be patient!")
    # Force upper letters
    names(srcUrls) <- toupper(names(srcUrls))
    if(is.null(srcUrls["LL"]) || is.null(srcUrls["UG"]) ||
       is.null(srcUrls["GO"])){
        stop("Source ulrs for LocusLink, UniGene, and GO are required!")
    }
    # Write AnnInfo to .GlobalEnv to make Annotation information available
    makeSrcInfo()
    # Set the path to parsers and base files
    path <- file.path(.path.package("AnnBuilder"), "data")
    # Get the parsers to map probe ids to LL ids through GB or UG ids
 #   baseParsers <- getBaseParsers(baseMapType)
    # Read in the base file
    options(show.error.messages = FALSE)
    base <- try(matrix(scan(baseName, what = "", sep = "\t", quote = "",
                        quiet = TRUE), ncol = 2, byrow = TRUE))
    options(show.error.messages = TRUE)
    if(inherits(base, "try-error")){
        stop(paste("Base file", baseName,
                   "may not be valid or have two columns"))
    }
    colnames(base) <- c("PROBE", "ACC")
    # Need to get source data from LocusLink(LL), UniGene(UG),
    # GoldenPath(GP), Gene Ontology(GO), and KEGG(KEGG). Instantiate
    # objects for each of them. Default values were right at the time
    # of writing but may be wrong later. Make corrections as needed.
    ll <- LL(srcUrl = srcUrls["LL"],
             parser = baseParser["LL"], baseFile = baseName)
    ug <- UG(srcUrl = srcUrls["UG"],
             parser = baseParser["UG"], baseFile = baseName,
             organism = organism)
    go <- GO(srcUrl = srcUrls["GO"])
    if(!is.na(srcUrls["GP"])){
        gp <- GP(srcUrl = srcUrls["GP"], organism = organism)
    }
    if(!is.na(srcUrls["KEGG"])){
        kegg <- KEGG(srcUrl = srcUrls["KEGG"], organism = organism)
    }
    if(baseMapType != "ll"){
        if(baseMapType == "image"){
            unified <- unifyMappings(base, NULL, ug, otherSrc, fromWeb)
        }else{
            unified <- unifyMappings(base, ll, ug, otherSrc, fromWeb)
        }
    }else{
        unified <- baseName
    }
    # Using the unified mapping as the base file, the data file
    # (ll_tmpl) can be parsed to get annoation data from LocusLink
    # using the correct parser
    parser(ll) <- file.path(path, "llParser")
    baseFile(ll) <- unified
    options(show.error.messages = FALSE)
    annotation <- try(parseData(ll, ncol = 15, fromWeb = fromWeb,
                         mergeKey = FALSE))
    options(show.error.messages = TRUE)
    if(inherits(annotation, "try-error")){
        stop(paste("Parsing LocusLink data failed because of:\n\n", annotation))
    }
     colnames(annotation) <- c("PROBE", "ACCNUM", "LOCUSID", "UNIGENE",
                                  "GENENAME", "SYMBOL","CHR", "MAP",
                                  "PMID", "GRIF", "SUMFUNC", "GO",
                                  "OMIM", "NM", "NP")
    if(!is.na(srcUrls["GP"])){
        # Get strand data by processing data from GoldenPath
        options(show.error.messages = FALSE)
        strand <- try(getStrand(gp))
        options(show.error.messages = TRUE)
        if(inherits(strand, "try-error")){
            stop(paste("Failed to parse Golden Path data because of:\n\n",
                       strand))
        }
        # An extra merge is need to keep column order of annoation
        # unchanged after merging
#        strand <- merge(annotation[,c("PROBE", "LOCUSID")],
#                    strand, by = "LOCUSID", all.x = TRUE)[,c("PROBE",
#                                            "CHRORI", "CHRLOC")]
        annotation <- merge(annotation, strand, by = "LOCUSID", all.x = TRUE)
    }
    if(!is.na(srcUrls["KEGG"])){
        # Adding KEGG pathway data to annoataion
        pathNEnzyme <- mapLL2ECNPName(kegg)
        annotation <- merge(annotation, pathNEnzyme$llpathname,
                            by = "LOCUSID", all.x = TRUE)
        annotation <- merge(annotation, pathNEnzyme$llec,
                            by = "LOCUSID", all.x = TRUE)
    }
    if(!is.na(srcUrls["HG"])){
        intID2LL <- getLL2IntID(procHomoData(srcUrls["HG"]), organism)
        annotation <- merge(annotation, intID2LL,
                            by = "LOCUSID", all.x = TRUE)

    }
    annotation <- as.matrix(annotation)
    # Convert "" to NA
    annotation[annotation == ""] <- NA
    # Probe id can not be NULL
    annotation <- annotation[!is.null(annotation[, "PROBE"]),]
    if(makeXML){
        multC <- colnames(annotation)[is.element(colnames(annotation),
                                             getMultiColNames())]
        typeC <- colnames(annotation)[is.element(colnames(annotation),
                                             getTypeColNames())]
        XMLOut <- file.path(pkgPath, paste(pkgName, ".xml", sep = ""))
        fileToXML(targetName = pkgName, outName = XMLOut,
                  inName = annotation, colNames = "",
                  idColName = "PROBE", multColNames = multC,
                  typeColNames = typeC, isFile = FALSE, version = version)
    }
    # Create a data package with no data
    createEmptyDPkg(pkgName, pkgPath, force = TRUE)
    # Write data to the package for one to one mappings
    for(i in getUniColNames()){
        env <- new.env(hash = TRUE, parent = NULL)
        if(i == "LOCUSID"){
            multiassign(annotation[,"PROBE"],
                        as.integer(annotation[,i]), env)
        }else{
            multiassign(annotation[,"PROBE"], as.vector(annotation[,i]), env)
        }
        assign(paste(pkgName, i, sep = ""), env)
        save(list = paste(pkgName, i, sep = ""), file =  file.path(pkgPath,
             pkgName, "data", paste(pkgName, i, ".rda", sep = "")))
    }

    # Write data to the package for one to many mappings with
    # multiple mappings separated by a ";"
    for(i in intersect(colnames(annotation), getMultiColNames())){
        env <- new.env(hash = TRUE, parent = NULL)
        multiassign(annotation[,"PROBE"],
               lapply(annotation[,i], splitEntry), env)
        assign(paste(pkgName, i, sep = ""), env)
        save(list = paste(pkgName, i, sep = ""), file =  file.path(pkgPath,
             pkgName, "data", paste(pkgName, i, ".rda", sep = "")))
    }
    # Write the mappings between probe ids and evidence provided by LL
    # and  CHRORI and CHRLOC. All of them have a "@" separating some
    # descriptive values for an entry. Multiple entries are separated
    # by ";"
    for(i in intersect(colnames(annotation), c("GO", "CHRLOC"))){
        env <- new.env(hash = TRUE, parent = NULL)
        if(i == "CHRLOC"){
            multiassign(annotation[, "PROBE"],
                        lapply(annotation[, i], twoStepSplit,
                               asNumeric = TRUE), env)
        }else{
            multiassign(annotation[, "PROBE"],
                        lapply(annotation[, i], twoStepSplit), env)
        }
        assign(paste(pkgName, i, sep = ""), env)
        save(list = paste(pkgName, i, sep = ""),
             file =  file.path(pkgPath, pkgName, "data",
             paste(pkgName, i, ".rda", sep = "")))
    }
    # Get the reverse mapping between PubMed, pathway, enzyme and probe
    # ids. Column names have to be modified for each reverse mapping
    env <- cols2Env(annotation[,c("PMID", "PROBE")],
                    colNames = c("PMID", "PROBE"), keyColName = "PMID")
    assign(paste(pkgName, "PMID2PROBE", sep = ""), env)
    save(list = paste(pkgName, "PMID2PROBE", sep = ""),
         file =  file.path(pkgPath, pkgName, "data",
         paste(pkgName, "PMID2PROBE.rda", sep = "")))
    if(!is.na(srcUrls["KEGG"])){
        env <- cols2Env(annotation[,c("PATH", "PROBE")],
                        colNames = c("PATH", "PROBE"), keyColName = "PATH")
        assign(paste(pkgName, "PATH2PROBE", sep = ""), env)
        save(list = paste(pkgName, "PATH2PROBE", sep = ""),
             file =  file.path(pkgPath, pkgName, "data",
             paste(pkgName, "PATH2PROBE.rda", sep = "")))
        env <- cols2Env(annotation[,c("ENZYME", "PROBE")],
                        colNames = c("ENZYME", "PROBE"),
                        keyColName = "ENZYME")
        assign(paste(pkgName, "ENZYME2PROBE", sep = ""), env)
        save(list = paste(pkgName, "ENZYME2PROBE", sep = ""),
             file =  file.path(pkgPath, pkgName, "data",
             paste(pkgName, "ENZYME2PROBE.rda", sep = "")))
    }
    # Reassign a parser and base file to parse LocusLink data to map
    # GO ids to probe ids
    parser(ll) <- file.path(path, "GO2ProbeParser")
    options(show.error.messages = FALSE)
    go2Probe <- try(parseData(ll, ncol = 3, fromWeb = fromWeb))
    options(show.error.messages = TRUE)
    if(inherits(go2Probe, "try-error")){
        stop(paste("Failed to parse LocusLink data because of:\n\n",
                   go2Probe))
    }
    if(is.null(go2Probe)){
        warning("No mappings between GO and probe ids found")
    }else if(nrow(go2Probe) == 1){
        go2Probe <- matrix(go2Probe[, 1:2], nrow = 1)
        colnames(go2Probe) <- c("GO", "GO2PROBE")
        env <- new.env(hash = TRUE, parent = NULL)
        assign(go2Probe["GO"], go2Probe["GO2Probe"], env)
    }else{
        colnames(go2Probe) <- c("GO", "GO2PROBE", "COUNTS")
        # Drop column counts for now. Will get ride of the extra column
        go2Probe <- go2Probe[, c("GO", "GO2PROBE")]
        # Remove "" for GO id resulting from mapping probe ids to NA GO id
        go2Probe <- go2Probe[go2Probe[,1] != "", ]
        # Write GO id to probe id mappings to data package
        env <- new.env(hash = TRUE, parent = NULL)
        multiassign(as.vector(go2Probe[, 1]),
                    sapply(go2Probe[, 2], splitEntry), env)
        assign(paste(pkgName, "GO2PROBE", sep = ""), env)
        save(list = paste(pkgName, "GO2PROBE", sep = ""),
             file =  file.path(pkgPath, pkgName, "data",
             paste(pkgName, "GO2PROBE.rda", sep = "")))
    }
    # Get the mappings between GO ids to all the probe ids (including
    # children)
    options(show.error.messages = FALSE)
    goData <- try(readData(go, xml = TRUE, fromWeb = fromWeb))
    options(show.error.messages = TRUE)
    go2All <- mapGO2AllProbe(go2Probe, goData, "", sep = ";", all = TRUE)
    # Write GO id to all probe ids mappings to data package
    env <- new.env(hash = TRUE, parent = NULL)
    multiassign(names(go2All), sapply(go2All, splitEntry), env)
    assign(paste(pkgName, "GO2ALLPROBES", sep = ""), env)
    save(list = paste(pkgName, "GO2ALLPROBES", sep = ""),
         file =  file.path(pkgPath, pkgName, "data",
         paste(pkgName, "GO2ALLPROBES.rda", sep = "")))
    go2All <- cbind(names(go2All), go2All)
    colnames(go2All) <- c("GO", "GO2ALLPROBES")
    if(makeXML){
        if(!is.null(go2All) && !is.null(go2Probe)){
            mergedGO <- as.matrix(merge(as.matrix(go2All),
                          as.matrix(go2Probe), by = "GO", all.x = TRUE))
            XMLByNum <- file.path(pkgPath, paste(pkgName, "ByNum.xml",
                                                 sep = ""))
#            fileToXML("GOByNum", XMLByNum, mergedGO, "GO",
#                      c("GO", "GO2ALLPROBES", "GO2PROBE"),
#                      c("GO2PROBE","GO2ALLPROBES"), "", ";", "",
#                      organism = "GO", isFile = FALSE, version = version)
            fileToXML(targetName = "GOByNum", outName = XMLByNum,
                  inName = mergedGO, colNames = "",
                  idColName = "GO", multColNames = c("GO2ALLPROBES",
                                    "GO2PROBE"),
                  typeColNames = "", isFile = FALSE, version = version)
        }
    }
    writeAccessory(pkgName, pkgPath, organism, version, author)
    # Write the quality control data
    # Quality control
    getDPStats(baseName, pkgName, pkgPath)
    writeMan4QC(pkgName, pkgPath)
    # Clean up
    unlink(unified)
}


unifyMappings <- function(base, ll, ug, otherSrc, fromWeb){
    trusted <- NULL
    # Get the unified mappings between probe ids and locusLink ids
    # based on multiple sources.
    if(!is.null(ll)){
        options(show.error.messages = FALSE)
        llMapping <- try(parseData(ll, fromWeb = fromWeb))
        options(show.error.messages = TRUE)
        if(inherits(llMapping, "try-error")){
            stop(paste("Failed to get or parse LocusLink data",
                       "because of:\n\n",llMapping))
        }
        colnames(llMapping) <- c("PROBE", "LL")
    }
    if(!is.null(ug)){
        options(show.error.messages = FALSE)
        ugMapping <- try(parseData(ug, fromWeb = fromWeb))
        options(show.error.messages = TRUE)
        if(inherits(ugMapping, "try-error")){
            stop(paste("Failed to get or parse UniGene data becaus of:\n\n",
                       ugMapping))
        }
        colnames(ugMapping) <- c("PROBE", "UG")
    }
    # Merge data from all the sources by probe id based on the base
    # file. The probe id is the first column ("V1") in this case.
    # Merge two at a time.
    if(!is.null(ll) && nrow(llMapping) > 0){
        merged <- merge(base, llMapping, by = "PROBE", all.x = TRUE)
        trusted <- "LL"
         # Free the space
        llMapping <- NULL
    }else{
        merged <- base
    }
    if(!is.null(ug) && nrow(ugMapping) > 0){
        merged <- merge(merged, ugMapping, by = "PROBE", all.x = TRUE)
        trusted <- c(trusted, "UG")
        ugMapping <- NULL
    }
    # The following three mappings are for Affymetrix gene chips.
    # Skip if the target of annotation is not a Affymetrix chip with
    # existing mapping data file in the data directory of AnnBuilder.
    if(!is.null(otherSrc)){
        for(i in names(otherSrc)){
            options(show.error.messages = FALSE)
            temp <- try(matrix(scan(otherSrc[i], what = "", sep ="\t",
                         quote = "", quiet = TRUE), ncol = 2, byrow = TRUE))
            options(show.error.messages = TRUE)
            if(inherits(temp, "try-error")){
                stop(paste("File", otherSrc[i],
                           "may not be valid or have two columns"))
            }
            colnames(temp) <- c("PROBE", i)
            merged <- merge(merged, temp, by = "PROBE", all.x = TRUE)
        }
    }
    # Change "NA" tp NA
    merged <- as.matrix(merged)
    merged[merged == "NA"] <- NA
    # Get the unified mapping based on all the sources used
    unified <- resolveMaps(merged, trusted = trusted,
                            srcs = c(names(otherSrc)))
    return(unified)
}

getBaseParsers <- function(baseMapType = c("gb", "ug", "image", "ll")){
    type <- match.arg(baseMapType)
    path <- file.path(.path.package("AnnBuilder"), "data")
    switch(type,
           gb = return(c(LL = file.path(path, "gbLLParser"),
                         UG = file.path(path, "gbUGParser"))),
           ug = return(c(LL = file.path(path, "ugLLParser"),
                         UG = file.path(path, "ugUGParser"))),
           image = return(c(UG = file.path(path, "imageUGParser"),
                          LL = "NA")),
           ll = return(c(LL = file.path(path, "llParser"),
                         UG = "NA")),
           stop("Invalid type"))
}

createEmptyDPkg <- function(pkgName, pkgPath,
                            folders = c("man", "R", "data"), force = TRUE){
    if(file.exists(file.path(pkgPath, pkgName))){
        if(!force){
            stop(paste("Package", pkgName, "already exists"))
        }else{
            unlink(file.path(pkgPath, pkgName), TRUE)
        }
    }
    dir.create(file.path(pkgPath, pkgName))
    for(i in folders){
        dir.create(file.path(pkgPath, pkgName, i))
    }
}
# Split multiple entry for a given mapping
splitEntry <- function(dataRow, sep = ";", asNumeric = FALSE){
    if(is.na(dataRow) || is.null(dataRow) || dataRow == ""){
        return(NA)
    }else{
        if(asNumeric){
            return(as.numeric(unlist(strsplit(dataRow, sep))))
        }else{
            return(unlist(strsplit(dataRow, sep)))
        }
    }
}
# Split multiple entry with two separaters (e. g. 12345@18;67891@18)
twoStepSplit <- function(dataRow, entrySep = ";", eleSep = "@",
    asNumeric = FALSE){
    splitEle <- function(entry){
        if(!is.na(entry)){
            temp <- unlist(strsplit(entry, eleSep), use.names = FALSE)
            if(asNumeric){
                elements <- as.numeric(temp[1])
            }else{
                elements <- temp[1]
            }
            names(elements) <- temp[2]
            return(elements)
        }else{
            return(NA)
        }
    }
    temp <- unlist(strsplit(dataRow, entrySep), use.names = FALSE)
    # Force "NA" to be NA
    temp[temp == "NA"] <- NA
    return(unlist(lapply(temp, splitEle), use.names = TRUE))
}

# This function takes two data columns from a data file (e. g. two
# columns) and generates an environment object with
# values in one of the columns as keys and values in the other as
# values. A given row in either column may have multiple values
# separated by a separator(e. g. "a;b") and has to be dealt with.
#
# Copyright 2002, J. Zhang, all rights reserved
#

cols2Env <- function(cols, colNames, keyColName = colNames[1], sep = ";"){

    # Environment to return
    dataEnv <- new.env(hash = TRUE, parent = NULL)

    # Force a matrix
    cols <- as.matrix(cols)
    if(!missing(colNames)){
        colnames(cols) <- colNames
    }

    if(ncol(cols) != 2){
        stop("The input data do not have two columns")
    }
    # Determine the column name
    valColName <- colNames[colNames != keyColName]
    # Match cols by finding matches for all the potential multiple
    # matches
    cols <- matchAll(cols, keyColName)
    colnames(cols) <- colNames
    #Get unduplicated row
    unDups <- cols[!duplicated(cols[,keyColName]),]
    colnames(unDups) <- colNames
    # Write unique keys and values
    multiassign(unDups[, keyColName], unDups[, valColName], dataEnv)
    # Process the duplications
    dups <- cols[duplicated(cols[,keyColName]),]
    colnames(dups) <- colNames
    if(nrow(dups) > 0){
        for(i in 1:nrow(dups)){
            assign(dups[i,keyColName], unique(c(get(dups[i, keyColName],
                                   dataEnv), dups[i,valColName])), dataEnv)
        }
    }

    return(dataEnv)
}

matchAll <- function(cols, keyColName){
    matched <- NULL

    for(i in 1:nrow(cols)){
        temp <- matchOneRow(cols[i,], keyColName)
        if(!is.null(temp)){
            if(is.null(matched)){
                matched <- temp
            }else{
                matched <- rbind(matched, temp)
            }
        }
    }
    return(matched)
}

matchOneRow <- function(cols, keyColName, sep = ";"){

    doMatch <- function(col1id){
        if(is.null(matched)){
            matched <<- cbind(col1id, col2)
        }else{
            matched <<- rbind(matched, cbind(col1id, col2))
        }
    }

    matched <- NULL
    # Key can not be any of these
    if(cols[keyColName] == "" || is.na(cols[keyColName])||
       is.null(cols[keyColName])){
        return(NULL)
    }

    col1 <- unlist(strsplit(cols[1], sep), use.names = FALSE)
    col2 <- unlist(strsplit(cols[2], sep), use.names = FALSE)
    # If no mapping is found for key, assign NA to key
    if(match(keyColName, names(cols)) == 1 && length(col2) == 0){
        return(cbind(col1, NA))
    }else if(match(keyColName, names(cols)) == 2 && length(col1) == 0){
        return(cbind(NA, col2))
    }
    # Otherwise, do a full matching
    sapply(col1, doMatch)

    return(matched)
}

# This function creates descriptive statistics for a given data
# package.
#
# Copyright 2002, J. Zhang. All rights reserved.
#

getDPStats <- function(baseF, pkgName, pkgPath, saveList = TRUE,
                       isFile = TRUE){

    dateBuilt <- getDate(pkgName, pkgPath)
    keyMapped <- getProbeNum(pkgName, pkgPath, TRUE)
    probeBased <- paste(pkgName, getPBased(), sep = "")

    otherMapped <- NULL
    for(i in setdiff(names(keyMapped), probeBased)){
        if(any(i == names(keyMapped))){
            tempNum <- as.numeric(keyMapped[i])
            names(tempNum) <- i
            otherMapped <- c(otherMapped, tempNum)
        }
    }

    if(!is.null(baseF) && !is.na(baseF) && baseF != ""){
        if(isFile){
            srcNumProbes <- length(readLines(baseF))
        }else{
            srcNumProbes <- nrow(baseF)
        }
        numProbes <- getProbeNum(pkgName, pkgPath, FALSE)
        keyMissMatch <- matchProbes(baseF, pkgName, pkgPath,
                                    probeBased, isFile = isFile)

        numMissMatch <- "None"
        for(i in names(numProbes)){
            if(any(i == probeBased) && numProbes[i] != srcNumProbes){
                if(numMissMatch == "None"){
                    numMissMatch <- NULL
                }
                numMissMatch <- c(numMissMatch, i)
            }
        }

        probeMapped <- NULL
        for(i in names(keyMapped)){
            if(any(i == probeBased)){
                tempNum <- as.numeric(keyMapped[i])
                names(tempNum) <- i
                probeMapped <- c(probeMapped, tempNum)
            }
        }
    }else{
        probeMapped <- NULL
        numMissMatch <- NULL
        srcNumProbes <- NULL
        numProbes <- NULL
        keyMissMatch <- NULL
    }
    statsList <- list(name = pkgName,
                      built = dateBuilt,
                      probeNum = srcNumProbes,
                      numMissMatch = numMissMatch,
                      probeMissMatch = keyMissMatch,
                      probeMapped = probeMapped,
                      otherMapped = otherMapped)
    QCList <- formatABQCList(statsList)
    if(saveList){
        assign(paste(pkgName, "QC", sep = ""), QCList)
        save(list = paste(pkgName, "QC", sep = ""),
             file = file.path(pkgPath, pkgName, "data",
             paste(pkgName, "QC.rda", sep = "")))
    }
    return(statsList)
}

formatABQCList <- function(x){
    QCList <- paste("\n\nQuality control information for ", x$name,
                    "\nDate built:", x$built,
                    ifelse(is.null(x$probeNum), "",
                           paste("\nNumber of probes:", x$probeNum)),
                    ifelse(is.null(x$numMissMatch), "",
                           paste("\nProbe number missmatch:",
                                 paste(x$numMissMatch, sep = "",
                                       collapse = "; "))),
                    ifelse(is.null(x$probeMissMatch), "",
                           paste("\nProbe missmatch:",
                                 paste(x$probeMissMatch, sep = "",
                                       collapse = "; "))),
                    ifelse(is.null(x$probeMapped), "",
                           paste("\nMappings found for ",
                                 "probe based rda files: \n",
                                 paste("\t", names(x$probeMapped),
                                       "found", x$probeMapped,
                                       "of", x$probeNum, sep = " ",
                                       collapse = "\n"),
                                 sep = "", collapse ="\n")),
                    "\nMappings found for non-probe based rda files:\n",
                    paste("\t", names(x$otherMapped), "found",
                          x$otherMapped, sep = " ", collapse = "\n"),"\n\n")
    return(QCList)
}

getDate <- function(pkgName, pkgPath, fromDesc = TRUE){
    if(fromDesc){
        fileName <- file.path(pkgPath, pkgName, "DESCRIPTION")
        if(!file.exists(fileName))
            stop(paste("DESCRIPTION file was not found in",
                       file.path(pkgPath, pkgName), "!"))

        tempFile <- readLines(fileName)

        for (i in tempFile){
            if(regexpr("Created:", i) > 0){
                return(gsub("^Created:(*)", "\\", i))
            }
        }
        return("Date built not found")
    }else{
        return(date())
    }
}

getProbeNum <- function(pkgName, pkgPath, noneNA = FALSE){
    toReturn <- NULL

    dataFolder <- file.path(pkgPath, pkgName, "data")
    rdaFiles <- getDirContent(dataFolder, paste(pkgName, "QC.rda", sep = ""))

    for(i in rdaFiles){
        temp <- unlist(strsplit(i, "\\."))
        load(file.path(dataFolder, i))
        if(!noneNA){
            num <- length(ls(get(temp[1])))
        }else{
            if(length(ls(get(temp[1]))) == 0){
                num <- 0
            }else{
                values <- multiget(ls(get(temp[1])), env = get(temp[1]))
                values <- values[!is.na(values)]
                values <- values[!is.null(values)]
                values <- values[values != ""]
                values <- values[values != "NA"]
                num <- length(values)
            }
        }
        names(num) <- temp[1]
        toReturn <- c(toReturn, num)
    }
    return(toReturn)
}

getDirContent <- function(dirName, exclude = NULL){
    if(is.null(exclude)){
        return(list.files(dirName))
    }else{
        temp <- list.files(dirName)
        index <- unlist(lapply(exclude, grep, temp))
        if(length(index) > 0){
            return(temp[-index])
        }else{
            return(temp)
        }
    }
}

matchProbes <- function(baseF, pkgName, pkgPath, toMatch, isFile = TRUE){

    toReturn <- "None"

    if(isFile){
        srcProbes <- read.delim(baseF, header = FALSE, sep = "\t")
    }else{
        srcProbes <- baseF
    }
    dataFolder <- file.path(pkgPath, pkgName, "data")
    rdaFiles <- list.files(dataFolder)

    for(i in rdaFiles){
        if(regexpr(".rda", i) > 0){
            name <- gsub(paste(pkgName,"(.*).rda", sep = ""),
                         "\\1", i)
            load(file.path(dataFolder, i))
            if(any(toMatch == name)){
                if(any(is.element(srcProbes[,1],
                                  ls(get(temp[1]))) == FALSE)){
                    if(toReturn == "None"){
                        toReturn <- NULL
                    }
                    toReturn <- c(toReturn, i)
                }
            }
        }
    }
    return(toReturn)
}

getPBased <- function(){

    probeBased <- NULL
    for(i in ls(AnnInfo)){
        if(get(i, AnnInfo)$pbased == "Y"){
            probeBased <- c(probeBased, i)
        }
    }
    return(probeBased)
}

# Returns a vector for the names of columns that have multiple entries
# separated by a ";" in some of the cells
getMultiColNames <- function(){
     return(c("PMID", "PATH", "ENZYME", "CHR", "UNIGENE", "HGID", "OMIM"))
}

getUniColNames <- function(){
    return(c("ACCNUM", "LOCUSID", "GENENAME", "SYMBOL", "MAP",
             "GRIF", "SUMFUNC", "NM", "NP"))
}

getTypeColNames <- function(){
    return(c("GO", "CHRLOC"))
}

#nameGOByCategory <- function(goids, goCategory, sep = ";"){
#    nameGOID <- function(goids){
#        if(is.null(goids) || is.na(goids)){
#            return(goids)
#        }
#        goids <- gsub("@[a-z]*", "", goids, TRUE)
#        goids <- unlist(strsplit(goids, sep), use.names = FALSE)
#        names(goids) <- goCategory[is.element(goCategory[,1], goids), 2]
#        return(goids)
#    }
#    goids <- lapply(goids, nameGOID)
#    goids[goids == "NA"] <- NA
#    return(goids)
#}

# This function writes the organism data
writeOrganism <- function(pkgName, pkgPath, organism){
    assign(paste(pkgName, "ORGANISM", sep = ""), organism)
    save(list = paste(pkgName, "ORGANISM", sep = ""),
             file =  file.path(pkgPath, pkgName, "data",
             paste(pkgName, "ORGANISM.rda", sep = "")))
}

# This function writes chromosome length data
writeChrLength <- function(pkgName, pkgPath, chrLengths){
    assign(paste(pkgName, "CHRLENGTHS", sep = ""), chrLengths)
    save(list = paste(pkgName, "CHRLENGTHS", sep = ""),
             file =  file.path(pkgPath, pkgName, "data",
             paste(pkgName, "CHRLENGTHS.rda", sep = "")))
}

# This function estimates the total lengths of chromosomes by finding
# out the maximum chromosome loaction and then increasing the number by 1000
findChrLength <- function(organism, srcUrl = getSrcUrl("GP", organism)){
    chrLengths <- vector()

    estimateLength <- function(chroNum){
        locs <- locatData[locatData[,1] == as.character(chroNum),]
        if(nrow(locs) == 0){
            temp <- NA
        }else if (nrow(locs) == 1){
            temp <- as.numberic(locs[, 2])
        }else {
            temp <- max(as.numeric(locs[, 2]))
        }
        chrLengths[as.character(chroNum)] <<- temp
    }
    locatData <- getGPData(paste(srcUrl, "refGene.txt.gz", sep = ""),
                           ncol = 10, keep = c(2,4))
    # Remove "chr" from chromosome numbers
    locatData[,1] <- gsub("^chr", "\\", locatData[, 1])
    chromosomes <- unique(locatData[,1])
    # Do numbered chromosomes first
    options(warn = -1)
    numbered <- chromosomes[!is.na(as.numeric(chromosomes))]
    options(warn = 0)
    tt <- sapply(sort(as.numeric(numbered)), estimateLength)
    # Do others
    switch(toupper(organism),
           MOUSE = ,
           RAT = ,
           HUMAN = sapply(c("X", "Y"), estimateLength),
           stop(paste("Do not know what to do with", organism)))
    # Remove entries with random locations
    # Add 1000 bases
    return(chrLengths + 1000)
}
# Constructors an object of GEO
#

GEO <- function(srcUrl = "http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?"){

    new("GEO", srcUrl = srcUrl)
}

# Query the GEO database. srcUrl gives the url for a common CGI scrips
# and GEOAccNum is the GEO accession number representing a file in the
# database
queryGEO <- function(GEOObj, GEOAccNum){

    srcURL <- paste(srcUrl(GEOObj), "acc=", GEOAccNum,
                     "&view=data&form=text&targ=self", sep = "")

    conn <- url(srcURL, open = "r")
    temp <- readLines(conn)
    close(conn)
    # Remove the header lines that come with the file and
    temp <- strsplit(temp[grep("\t", temp)], "\t")
    # Convert to a matrix
    temp <- t(sapply(temp, unlist))
    # The first row is for column name. Remove it.
    colnames(temp) <- temp[1,]
    return(temp[-1,])
}
# Constructors an object of GO
#

GO <- function(srcUrl = getSrcUrl("GO", xml = TRUE), parser = "",
               baseFile = ""){

    new("GO", srcUrl = srcUrl, parser = parser, baseFile = baseFile)
}

# The GO site provides an XML document for the molecular function,
# biological process, and cellular component of genes. The basic XML
# structure is something like:
# <go:term>
#   <go:accession>GO:000xxx</go:accession>
#   <go:name>a string for the function, process, or component</go:name>
#   <go:isa rdf:resource="http://www.geneontology.org/go#GO:000xxx/>
#   .
#   .
#   .
# <go:term>
#
# The XML document istself does not differentiats among the fuction,
# biological process, and cellular component of genes as a go:name tag
# is used for the fucntion, process, and component of genes. To
# determine whether a go:name tag is for the function, process, or
# component of a given gene idenfied by a GO accession number, the
# go:isa tag has to be used to move up a tree to find the correct
# category.
#
# For example, gene ontology has an accession number of GO:0003673,
# mocular function GO:0003674, cellular componenet GO:0005574, and
# biological process GO:0008150. The mocular function, cellular
# component, and biological process all have the tag, <go:isa
# rdf:resource="http://www.geneontology.org/go#GO:0003673/>,
# indicating that they all belong to gene ontology. Similarly, if a
# gene (say GO:0001284) has a tag  <go:isa
# rdf:resource="http://www.geneontology.org/go#GO:0008150/>, we know
# that the text inclosed by the go:name tag should be considered as a
# biological process. Often, the go:isa tag of a gene may not directly
# reference a top level GO accession (e. g. the number for mocular
# function, cellular component, or biological process) but to another
# gene (say GO:0001284), the ontology of that gene will be traced back
# to biological process through the reference to GO:0008150 by gene
# GO:001284.
#
# The GOXMLParser reads in the go.xml document from Gene Ontology site,
# creates a table named gooriginal with three columns (GOID,
# ParentGOID, and Ontology) in the database, and then populates the
# table with data parsed from the XML document. Both the GOID and
# ParentGOID are keys for the table as a given GOID may have multiple
# parent GOIDs.The handlers used currently may need to be modified
# accordingly if Gene Ontology site changes the DTD of the XML document.
#
# Copyright 2001 J. Zhang, all rights reserved.
#
GOXMLParser <- function(fileName) {
    # Flags that will be checked by function text
    isGOID <- FALSE
    isOnto <- FALSE
    # Keeps a temporary value
    GOID <- NA
    parentID <- NA
    ontology <- NA
    association <- NA
    evidence <- NA
    # Keep the desired data
    GOData <- NULL
    # Reset the values when done with a go term
    reset <- function(){
        GOID <<- NA
        parentID <<- NA
        ontology <<- NA
        association <<- NA
        evidence <<- NA
    }
    # Get the GO id or ontology for an element
    getText <- function(value){
        if(isGOID){
            GOID <<- value
            isGOID <<- FALSE
        }else if(isOnto){
            ontology <<- value
            isOnto <<- FALSE
        }
    }
    # Parent GO id of an element is given as an attribute that can be
    # get this way
    getParentGOID <- function(attrs, assoType){
        if(is.na(parentID)){
            association <<- paste(parseIsa(attrs), assoType, sep = "@")
            parentID <<- parseIsa(attrs)
        }else{
            parentID <<- paste(parentID, parseIsa(attrs), sep = ";")
            names(assoType) <- parseIsa(attrs)
            association <<- paste(association,
                              paste(parseIsa(attrs), assoType, sep = "@"),
                                    sep = ";")
        }
    }
    # parse the isa tag and get only the GO accession number
    parseIsa <- function(isaTag){
        return(gsub(".*http.*#(.*)", "\\1", isaTag))
    }
    # Merge the parsed data
    mergeData <- function(){
        if(is.null(GOData)){
            GOData <<- c(GOID, parentID, ontology, association)
        }else{
            GOData <<- rbind(GOData, c(GOID, parentID, ontology, association))
        }
    }
    # Event handlers for the parser
    getHandler <- function(){
        # Gets the value for desired element
        startElement <- function(name, attrs){
            switch(name,
                   "go:accession" = isGOID <<- TRUE,
                   "go:name" = isOnto <<- TRUE,
                   "go:part_of" = getParentGOID(attrs, "partof"),
                   "go:is_a" = getParentGOID(attrs, "isa"))
        }
        # Write the data when entries for a GO term ends
        endElement <- function(name,...){
            if(name == "go:term"){
                mergeData()
                reset()
            }
        }
        text <- function(x){
            getText(x)
        }

        list(startElement = startElement, endElement = endElement,
             text = text)
    }
    xmlEventParse(fileName, handlers = getHandler())
    colnames(GOData) <- c("goid", "parents", "ontology", "association")
    # Remove GO:isa and GO:part-of and return
    return(GOData[!is.element(GOData[,1], c("GO:isa", "GO:partof")), ])
}

getChildNodes <- function(goid, goData){
    return(goData[grep(goid, goData[, 2]), 1])
}

getOffspringNodes <- function(goid, goData, keepTree = FALSE){

    offspring <- NULL

    getChildren <- function(goid){
#        print(goData[goData[,1] == goid,])
        return(goData[grep(goid, goData[, 2]), 1])
#        return(getChildNodes(goData, goid))
    }

    repeat{
        temp <- unique(unlist(sapply(goid, getChildren), use.names = FALSE))
        if(length(temp) > 0){
            goid <- temp
            if(is.null(offspring)){
                offspring <- unique(temp)
            }else{
                offspring <- unique(c(offspring, temp))
            }
        }else{
            break
        }
    }
    return(offspring)
}

getParentNodes <- function(goid, goData, sep = ";"){
    return(unlist(strsplit(goData[(goData[,1] == goid), 2], sep),
                  use.names = FALSE))
}

getAncestors <- function(goid, goData, sep = ";", keepTree = FALSE,
                         top = "GO:0003673"){
    ancestors <- NULL
    getAncestor <- function(goid){
         return(unlist(strsplit(goData[(goData[,1] == goid), 2], sep),
                  use.names = FALSE))
#        return(getParentNodes(goData, goid, sep = sep))
    }

    repeat{
        temp <- unique(unlist(sapply(goid, getAncestor), use.names = FALSE))
        temp <- temp[temp != top]
        if(length(temp) > 0){
            goid <- temp
            if(is.null(ancestors)){
                if(keepTree){
                    ancestors <- list(1, unique(temp))
                }else{
                    ancestors <- unique(temp)
                }
            }else{
                if(keepTree){
                    ancestors[[length(ancestors) + 1]] <- unique(temp)
                }else{
                    ancestors <- unique(c(ancestors, temp))
                }
            }
        }else{
            if(keepTree){
                ancestors[[length(ancestors) + 1]] <- top
            }else{
                ancestors <- unique(c(ancestors, top))
            }
            break
        }
    }
    return(ancestors)
}

getTopGOid <- function(what = c("MF", "BP", "CC", "GO")){
    what = match.arg(what)
    switch(what,
           MF = return("GO:0003674"),
           BP = return("GO:0008150"),
           CC = return("GO:0005575"),
           GO = return("GO:0003673"),
           stop("Invalid argument imput!"))
}

mapGO2Category <- function(goData){
    goNCategory <- cbind(getTopGOid("GO"), "GO")
    goGroups <- getGOGroupIDs()
    for(i in names(goGroups)){
        goids <- c(as.vector(goGroups[i]),
                   getOffspringNodes(goGroups[i], goData))
        # Map GO ids to the three ontology categories.
        goNCategory <- rbind(goNCategory, cbind(goids, i))
    }
    return(goNCategory)
}

getGOGroupIDs <- function(onto = FALSE){
    if(!onto){
        return(c(MF = "GO:0003674", CC = "GO:0005575", BP = "GO:0008150"))
    }else{
        return("GO:0003673")
    }
}









# Functions that build a data package using GO data
#

GOPkgBuilder <- function(pkgName = "GO", pkgPath, version = "1.2.1",
                         srcUrl = getSrcUrl("GO", xml = TRUE),
                         author = list(author = "who",
                         maintainer = "who@email.com")){
    print("It may take me a while to process the data. Be patient!")
    if(missing(pkgPath)){
        pkgPath <- getwd()
    }
    # A matrix to keep GO ids and group (CC, BP, MF) information
    goNGroup <- cbind(getTopGOid("GO"), "GO")
    makeSrcInfo()
    go <- GO(srcUrl)
    goData <- readData(go, xml = TRUE)
    # Create a data package with no data
    createEmptyDPkg(pkgName, pkgPath, force = TRUE)
    # Write the association data
#    env <- new.env(hash = TRUE, parent = NULL)
#    multiassign(goData[,1], splitChromData(goData[, 4]), env)
#    assign(paste(pkgName, "ASSOCIATION", sep = ""), env)
#    save(list = paste(pkgName, "ASSOCIATION", sep = ""),
#         file =  file.path(pkgPath, pkgName, "data",
#         paste(pkgName, "ASSOCIATION", ".rda", sep = "")))
    # Process data for MF, CC, and BP and write the data to parents
    # and ontology term data file
    goGroups <- getGOGroupIDs()
    for(i in names(goGroups)){
        goids <- c(as.vector(goGroups[i]),
                   getOffspringNodes(goGroups[i], goData))
        # Map GO ids to the three ontology categories.
        goNGroup <- rbind(goNGroup, cbind(goids, i))
        temp <- goData[is.element(unique(goData[,1]), goids),]
        # Write GO id to parent GO id data file
        env <- new.env(hash = TRUE, parent = NULL)
        multiassign(temp[,1], sapply(temp[, 4], twoStepSplit), env)
        assign(paste(pkgName, i, "PARENTS", sep = ""), env)
        save(list = paste(pkgName, i, "PARENTS", sep = ""),
             file =  file.path(pkgPath, pkgName, "data",
             paste(pkgName, i, "PARENTS", ".rda", sep = "")))
        # Write Ontology file
        env <- new.env(hash = TRUE, parent = NULL)
        tempTerm <- goData[is.element(unique(goData[,1]),
                                      c(getTopGOid("GO"), goids)),]
        multiassign(temp[,1], sapply(temp[, 3], splitEntry), env)
        assign(paste(pkgName, i, "ID2TERM", sep = ""), env)
        save(list = paste(pkgName, i, "ID2TERM", sep = ""),
             file =  file.path(pkgPath, pkgName, "data",
             paste(pkgName, i, "ID2TERM", ".rda", sep = "")))
        # Write GO id to children GO id data file
        temp <- sapply(goids, getChildNodes, goData = goData)
        temp <- sapply(temp, paste, sep = "", collapse = ";")
        # Write GO id to parent GO id data file
        env <- new.env(hash = TRUE, parent = NULL)
        multiassign(names(temp), sapply(temp, splitEntry), env)
        assign(paste(pkgName, i, "CHILDREN", sep = ""), env)
        save(list = paste(pkgName, i, "CHILDREN", sep = ""),
             file =  file.path(pkgPath, pkgName, "data",
             paste(pkgName, i, "CHILDREN", ".rda", sep = "")))
    }
    # Write the mapping between GO ids and category information
    env <- new.env(hash = TRUE, parent = NULL)
    multiassign(goNGroup[,1], goNGroup[,2], env)
    assign(paste(pkgName, "CATEGORY", sep = ""), env)
    save(list = paste(pkgName, "CATEGORY", sep = ""),
         file =  file.path(pkgPath, pkgName, "data",
         paste(pkgName, "CATEGORY", ".rda", sep = "")))
    # Write man pages and other files
    writeAccessory(pkgName, pkgPath, organism = "GO",
                   version = version, author = author)
    # Quality control
    getDPStats("", pkgName, pkgPath)
    writeMan4QC(pkgName, pkgPath)
}













# Constructs an object of GP for Golden Path data
#

GP <- function(srcUrl = getSrcUrl("UCSC", "human"), organism = "human",
               parser = "", baseFile = ""){

    new("GP", srcUrl = srcUrl, organism = organism, parser = parser,
        baseFile = baseFile)
}

getChroLocation <- function(srcUrl, exten = gpLinkNGene(), sep = "\t",
                            fromWeb = TRUE, raw = FALSE){
    if(fromWeb){
        # Tow source data files are needed.
        linkData <- getGPData(paste(srcUrl, exten["link"], sep = ""),
                              sep = "\t", ncol = 8, keep = c(3,7))
        locatData <- getGPData(paste(srcUrl, exten["gene"], sep = ""),
                               ncol = 10, keep = c(1:5))
    }else{
        linkData <- read.table(srcUrl["link"], sep = sep, header = FALSE,
                               as.is = TRUE)
        linkData <- linkData[, c(3, 7)]
        locatData <- read.table(srcUrl["gene"], sep = sep, header = FALSE,
                                as.is = TRUE)
        locatData <- locatData[, c(1:4)]
    }
    colnames(linkData) <- c("ID", "LOCUSID")
    # Remove "chr" proceeding chromosome number
    locatData[, 2] <- gsub("^chr", "\\", locatData[, 2])
    if(raw){
        colnames(locatData) <- c("ID", "CHR", "STRAND", "START", "END")
        merged <- as.matrix(merge(linkData, locatData, by = "ID",
                                  all.x = TRUE))
        merged <- merged[, 2:6]
        # Remove <NA>s that are introduced by droping the ID column
        merged <- merged[!is.na(merged[,1]), ]
        return(merged)
    }
    # for - strand, end locations should be strat locations
    locatData <- rbind(locatData[locatData[,3] == "+", c(1:4)],
                       locatData[locatData[,3] == "-", c(1, 2, 3, 5)])
    # Put sign to locations
    locatData[, 4] <- paste(locatData[,3], locatData[,4], sep = "")
    locatData <- cbind(locatData[, 1], paste(locatData[, 4], "@",
                 locatData[,2], sep = ""))
    colnames(locatData) <- c("ID", "CHRLOC")
    merged <- as.matrix(merge(linkData, locatData, by = "ID",
                              all.x = TRUE)[, 2:3])
    # Remove <NA>s that are introduced by droping the ID column
    merged <- merged[!is.na(merged[,1]), ]
    return(mergeRowByKey(merged))
}

getGPData <- function(srcUrl, sep = "\t", ncol = 8, keep = c(3,7)){
    temp <- loadFromUrl(srcUrl)
    gpData <- matrix(scan(temp, what = "", sep = sep, quote = "",
                          strip.white = TRUE, quiet = TRUE), ncol = ncol,
                     byrow = TRUE)[,keep]
    unlink(temp)
    return(gpData)
}

gpLinkNGene <- function(test = FALSE){
    if(test){
        return(c(link = "Tlink.txt", gene = "TGene.txt"))
    }else{
        return(c(link = "refLink.txt.gz", gene = "refGene.txt.gz"))
    }
}

# No in use at this time
#gpParser <- function(){
#    c(link = file.path(.path.package("pubRepo"), "data", "gpLinkParser"),
#      gene = file.path(.path.package("pubRepo"), "data", "gpGeneParser"))
#}

# Constructors an object of KEGG
#

KEGG <- function(srcUrl = "ftp://ftp.genome.ad.jp/pub/kegg/pathways",
                 organism = "human", parser = "", baseFile = ""){

    new("KEGG", srcUrl = srcUrl, organism = organism, parser = parser,
        baseFile = baseFile)
}

getKEGGIDNName <- function(srcUrl, exten = "/map_title.tab" ){
    con <- url(paste(srcUrl, exten, sep = ""), "r")
    temp <- matrix(unlist(strsplit(readLines(con), "\t")),
                                      ncol = 2, byrow = TRUE)
    close(con)
    idNName <- temp[,2]
    names(idNName) <- temp[,1]
    return(idNName)
}

getKEGGOrgName <- function(name){
    switch(name,
           "human" = return("hsa"),
           "mouse" = return("mmu"),
           "rat" = return("rno"),
           "yeast" = return("sce"),
           stop("The organism is not supported"))
}

getLLPathMap <- function(srcUrl, idNName, organism){
    llEC <- NULL
    llPathName <- NULL
    # Get ll to EC and pathway name mappings
    for(i in names(idNName)){

        temp <- mapll2EC(i, srcUrl = srcUrl, organism)
        if(!is.null(temp)){
            temp <- t(sapply(temp, parseEC))
            # Map ll to EC
            if(is.null(llEC)){
                llEC <- temp[!is.na(temp[,2]),]
            }else{
                llEC <- rbind(llEC, temp[!is.na(temp[,2]),] )
            }
            # Map ll to pathway name
            if(length(unique(temp[,1])) <= 1){
                tempPN <- matrix(c(unique(temp[,1]), i), ncol = 2)
            }else{
                tempPN <- cbind(unique(temp[, 1]), i)
            }
            if(is.null(llPathName)){
                llPathName <- tempPN
            }else{
                llPathName <- rbind(llPathName, tempPN)
            }
        }
    }
    colnames(llEC) <- c("LOCUSID", "ENZYME")
    colnames(llPathName) <- c("LOCUSID", "PATH")
    return(list(llec = mergeRowByKey(llEC),
                llpathname = mergeRowByKey(llPathName)))
}


# Returns a matrix with Locus link ids as the first column and EC as
# second one with NA for ll not mapped to EC
mapll2EC <- function(id, srcUrl, organism, sep = "\t"){
    tempURL <- paste(srcUrl,"/", getKEGGOrgName(organism),
                     "/", getKEGGOrgName(organism), as.character(id),
                     ".gene", sep = "")

    on.exit(options(show.error.messages = TRUE))

    temp <- NULL
    con <- url(tempURL)
    options(show.error.messages = FALSE)
    llNEC <- try(readLines(con))
    close(con)
    options(show.error.messages = TRUE)

    if(!inherits(llNEC, "try-error")){
        temp <- strsplit(llNEC, split = sep)
#        temp <- t(sapply(temp, parseEC))
#        temp <- temp[!is.na(temp[,2]),]
        return(temp)
    }else{
        return(NULL)
    }
}

parseEC <- function(llNEC){
    if(length(grep(".*\\[EC:[0-9|\\.|-| ]*", llNEC[2])) != 0){
        EC <- strsplit(strsplit(llNEC[2], "\\[EC:")[[1]][2], "\\]")[[1]][1]
        # Collapse multiple EC number
        EC <- paste(unlist(strsplit(EC, " "), use.names = FALSE),
                    collapse = ";")

        return(c(llNEC[1], EC))
    }else{
        return(c(llNEC[1], NA))
    }
}













# This function creates a data package containing mappings between
# KEGG pathway names and pathway ids and enzyme ids and GO ids.
#
# Copyright 2002, J. Zhang. All rights reserved.
#

KEGGPkgBuilder <- function(pkgPath, pkgName = "KEGG", version = "1.0.1",
                         pathwayURL = getKEGGFile("path"),
                         enzymeURL = getKEGGFile("enzyme"),
                         force = TRUE, author = list(author = "who",
                         maintainer = "who@email.com")){
    # Create two environments to hold the parsed data
    pNameEnv <- new.env(hash = TRUE, parent = NULL)
    pIdEnv <- new.env(hash = TRUE, parent = NULL)
    makeSrcInfo()
    createEmptyDPkg(pkgName, pkgPath)
    # Write mappings between path ids and path names
    idNName <- getKEGGIDNName(pathwayURL, "")
    multiassign(idNName, names(idNName), env = pNameEnv)
    multiassign(names(idNName), idNName, env = pIdEnv)
    assign(paste(pkgName, "PATHNAME2ID", sep = ""), pNameEnv)
    assign(paste(pkgName, "PATHID2NAME", sep = ""), pIdEnv)
    save(list = paste(pkgName, "PATHNAME2ID", sep = ""),
         file =  file.path(pkgPath, pkgName, "data",
         paste(pkgName, "PATHNAME2ID.rda", sep = "")))
    save(list = paste(pkgName, "PATHID2NAME", sep = ""),
         file =  file.path(pkgPath, pkgName, "data",
         paste(pkgName, "PATHID2NAME.rda", sep = "")))
    # Writes mappings between GO and enzyme
    enzymeIdNGO <- getEIdNName(enzymeURL)
    enzymeGOEnv <- new.env(hash = TRUE, parent = NULL)
    goEnzymeEnv <- new.env(hash = TRUE, parent = NULL)
    multiassign(enzymeIdNGO[,1], enzymeIdNGO[,2], env = enzymeGOEnv)
    multiassign(enzymeIdNGO[,2], enzymeIdNGO[,1], env = goEnzymeEnv)
    assign(paste(pkgName, "ENZYMEID2GO", sep = ""), enzymeGOEnv)
    assign(paste(pkgName, "GO2ENZYMEID", sep = ""), goEnzymeEnv)
    save(list = paste(pkgName, "ENZYMEID2GO", sep = ""),
         file =   file.path(pkgPath, pkgName, "data",
         paste(pkgName, "ENZYMEID2GO.rda", sep = "")))
    save(list = paste(pkgName, "GO2ENZYMEID", sep = ""),
         file =   file.path(pkgPath, pkgName, "data",
         paste(pkgName, "GO2ENZYMEID.rda", sep = "")))
    # Write man pages and other files
    writeAccessory(pkgName, pkgPath, organism = "KEGG",
                   version = version, author = author)
    writeDescription(pkgName, pkgPath, version, author)
    # Write Rd files for all the rda files that already exist in data
    for(i in getAllRdaName(pkgName, pkgPath)){
        writeManPage(pkgName, pkgPath, i,
                     src = get(i, AnnInfo)$src)
    }
    # Write a function that displays data information when called
    # using pkgName() and the man page for the function
    writeFun (pkgPath, pkgName)
    writeMan4Fun(pkgName, pkgPath)
    # Write .First.lib
    writeZZZ(pkgPath, pkgName)
    # Quality control
    getDPStats("", pkgName, pkgPath)
    writeMan4QC(pkgName, pkgPath)
}

getEIdNName <- function(enzymeURL =
                        "http://www.geneontology.org/external2go/ec2go"){

    kegg <- KEGG(srcUrl = enzymeURL)
    fileGot <- readData(kegg)

    getECNGO <- function(vect){
        return(c(vect[1], vect[length(vect)]))
    }
    # Remove the header if any
    if(length(grep("^!", fileGot)) > 0){
        fileGot <- fileGot[-grep("^!", fileGot)]
    }
    # Split entries using a space
    fileGot <- sapply(sapply(fileGot, strsplit, " "), unlist)
    # Create a matrix using the splited data
    fileGot <- matrix(sapply(fileGot, getECNGO), ncol = 2, byrow = TRUE)

    return(fileGot)
}


getKEGGFile <- function(whichOne){
    switch(tolower(whichOne),
           "path" = return(paste(getSrcUrl("KEGG"), "/",
                    "map_title.tab", sep = "")),
           "enzyme" = return("http://www.geneontology.org/external2go/ec2go"),
           stop("Name has to be eith path or enzyme!"))
}





# Constructs an object of LL
#

LL <- function(srcUrl = getSrcUrl("LL"),
               parser = file.path(.path.package("AnnBuilder"), "data",
                                "gbLLParser"), baseFile = ""){

    new("LL", srcUrl = srcUrl, parser = parser, baseFile = baseFile)
}

# This function parses a source file based on a given set of
# parsing instrucitons passed as "parser" and "baseFile".
#
# Copyright 2002. J. Zhang all rights reserved.
#

fileMuncher <- function(outName, baseFile, dataFile, parser, isDir = FALSE){

    OS <- .Platform$OS.type
    temp <- paste(basename(tempfile("tempPerl")),".pl", sep = "")
    perlName <- file.path(.path.package("AnnBuilder"), "temp", temp)

    writePerl <- function(toWrite){
        write(toWrite, file = perlName, append = TRUE)
    }

    if(!file.create(perlName))
        stop(paste("You do not have write permission to the ",
                   "directory for the Perl script createded", sep = ""))
    if(OS == "unix"){
        perlBin <- system("which perl", intern = TRUE)
        if(length(perlBin) > 1){
            stop("Perl is not available!")
        }
        writePerl(paste("#!", perlBin, "\n\n", sep = ""))
    }else if(OS == "windows"){
        writePerl("#!/usr/bin/perl -w\n\n")
    }

    if(baseFile != "" && !is.null(baseFile) && !is.na(baseFile)){
        statement <- paste("open(BASE, \"<", baseFile, "\") || die ",
                       "\"Can not open ", baseFile, "\";\n", sep = "")
        writePerl(statement)
    }

    if(dataFile != "" && !is.null(dataFile) && !is.na(dataFile)){
        if(isDir){
            statement <- paste("opendir(DIR, \"", dataFile, "\") || die ",
                           "\"Can not open director ",dataFile, "\";\n",
                           sep = "")
            writePerl(statement)
        }else{
            statement <- paste("open(DATA, \"<", dataFile, "\") || die ",
                       "\"Can not open ", dataFile, "\";\n", sep = "")
            writePerl(statement)
        }
    }

    statement <- paste("open(OUT, \">", outName, "\") || die ",
                       "\"Can not open ", outName, "\";\n\n", sep = "")
    writePerl(statement)
    if(isDir)
        writePerl(paste("$PATH = \"", dataFile, "\";", sep = ""))

    if(!is.null(parser))
        writePerl(readLines(parser))

    .callPerl(perlName, OS)
    unlink(perlName)
    return (outName)
}

.callPerl <- function(script, os){
    if(os == "unix"){
        system(paste("chmod +x", script))
        system(script)
    }else if(os == "windows"){
        script <- gsub("/", "\\\\", script)
        system(paste("perl", script))
    }else{
        stop(paste("Do not know who to run perl under ", os))
    }
}








SPPkgBuilder <- function(pkgPath, version, author, fromWeb = TRUE,
  url = "ftp://ftp.ebi.ac.uk/pub/databases/swissprot/release/sprot41.dat"){

    dataSrc <- paste("Swiss-Prot (\url{", url, "})", sep = "")
    # Just use LL to create a Swiss-Prot object
    sp <- LL(srcUrl = url, parser = file.path(.path.package("AnnBuilder"),
                           "data", "SPParser"))
    spData <- as.matrix(parseData(sp, ncol = 25, fromWeb = fromWeb))
    colnames(spData) <- getEnvNames()
    spData <- spData[2:nrow(spData),]
    spData[spData == "NA"] <- NA
    # Write AnnInfo to .GlobalEnv to make Annotation information available
    makeSrcInfo()
    for(i in c("human", "mouse", "rat")){
        pkgName <- paste(i, "SP", sep = "")
        tempData <- spData[spData[, "ORGANISM"] == toupper(i),]
        # Create a data package with no data
        createEmptyDPkg(pkgName, pkgPath, force = TRUE)
        # Write the template for man pages
        writeManPage(pkgName, pkgPath, "#ONETOONEMAN#", "",
                     src = get("#ONETOONEMAN#", AnnInfo)$src)
        writeManPage(pkgName, pkgPath, "#ONETOMANYMAN#", "",
                     src = get("#ONETOMANYMAN#", AnnInfo)$src)
        for(j in colnames(tempData)[!is.element(colnames(tempData),
                                                c("AC", "ORGANISM"))]){
            if(isOneToOne(j)){
                # AC may have multiple values. Collapse before writing
                saveData2Env(getReverseMapping(cbind(tempData[, j],
                                   gsub(" +", "", tempData[, "AC"]))),
                             fun = function(x) x, pkgName, pkgPath, j)
                copySubstitute(file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#ONETOONEMAN#.Rd", sep = "")),
                           file.path(pkgPath, pkgName,
                           "man", paste(pkgName, j, ".Rd", sep = "")),
                   list(ONETOONEMAN = j,
                        VALUE = tolower(getDetailV(j)),
                        KEY = "Swiss-Prot accession number",
                        ENVNAME = paste(pkgName, j, sep = ""),
                        DATASOURCE = dataSrc, ), "#")
            }else{
                saveData2Env(getReverseMapping(cbind(
                                   gsub(" +", "", tempData[, j]),
                             gsub(" +", "", tempData[, "AC"]))),
                             twoStepSplit,
                             pkgName, pkgPath, j)
                copySubstitute(file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#ONETOMANYMAN#.Rd", sep = "")),
                           file.path(pkgPath, pkgName,
                           "man", paste(pkgName, j, ".Rd", sep = "")),
                   list(ONETOMANYMAN = j,
                        VALUE = tolower(getDetailV(j)),
                        KEY = "protein accession number",
                        ENVNAME = paste(pkgName, j, sep = ""),
                        DATASOURCE = dataSrc, ), "#")
            }
        }
        # Do reverse mapping between MIM ids and accession number
        AC2MIM <- tempData[!is.na(tempData[, "MIM"]), c("AC", "MIM")]
        AC2MIM <- cbind(gsub(" +", "", AC2MIM[,1]), AC2MIM[,2])
        saveData2Env(getReverseMapping(AC2MIM),
                              twoStepSplit, pkgName, pkgPath, "MIM2AC")
        copySubstitute(file.path(pkgPath, pkgName, "man",
                       paste(pkgName, "#ONETOMANYMAN#.Rd", sep = "")),
                       file.path(pkgPath, pkgName, "man",
                              paste(pkgName, "MIM2AC", ".Rd", sep = "")),
                       list(ONETOMANYMAN = "MIM2AC",
                            VALUE = "Swess-Prot protein accession number",
                            KEY = "MIM id",
                            ENVNAME = paste(pkgName, "MIM2AC", sep = ""),
                            DATASOURCE = dataSrc, ), "#")
        # Write man pages and so on
        writeDescription(pkgName, pkgPath, version, author,
                         dataSrc = "NCBI")
        writeFun (pkgPath, pkgName, organism = organism)
        writeMan4Fun(pkgName, pkgPath)
        # Write .First.lib
        writeZZZ(pkgPath, pkgName)
        # Write the quality control data
        getDPStats("", pkgName, pkgPath)
        writeMan4QC(pkgName, pkgPath)
        file.remove(file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#ONETOONEMAN#.Rd", sep = "")))
        file.remove(file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#ONETOMANYMAN#.Rd", sep = "")))
    }
}

getDetailV <- function(key){
    values <- c("Swiss-Prot accession number","protein name",
                "specise name", "type of molecule",
                "total number of aminoacids", "PubMed id",
                paste("description of related protein sequence(s)",
                      "produced by alternative splicing or initiation",
                      "codons"),
                "description of reactions catalyzed",
                "possible errors and/or grounds for confusion",
                "an enzyme cofactor",
                paste("description of developmentally specific",
                      "expression of a protein"),
                paste("description of the disease(s) associated with",
                      "a deficiency of a protein"),
                "description of the domain structure of a protein",
                "description of an enzyme regulatory mechanism",
                "description of the function(s) of a protein",
                paste("description of the compound(s) which stimulate",
                      "the synthesis of a protein"),
                paste("weight of a protein or part of a protein as",
                      "determined by mass spectrometric methods"),
                paste("descriptionof the use of a protein as a",
                      "pharmaceutical drug"),
                "description of polymorphism(s)",
                "description of a posttranslational modification",
                paste("description of the similarities(s) of a",
                      "protein with orther proteins"),
                paste("description of the subcellular lacation of",
                      "the mature protein"),
                "desction of the quaternary structure of a protein",
                "description of the tissue specificity of a protein",
                "MIM id")
    names(values) <- getEnvNames()
    return(values[key])
}

getEnvNames <- function(){
    return(toupper(c("ac", "name", "organism", "type", "length", "pmid",
               "AP", "CA", "caution", "cofactor", "DS", "disease",
               "domain", "ER", "function", "induction", "MS", "pharm",
               "polym", "PTM", "similarity", "SL", "subunit", "TS","MIM")))
}

isOneToOne <- function(envName){
    if(any(envName == c("PMID"))){
        return(FALSE)
    }else{
        return(TRUE)
    }
}

# Constructs an object of UG
#

UG <- function(srcUrl = getSrcUrl("UG", "human"),
               parser = file.path(.path.package("AnnBuilder"), "data",
                                                    "gbUGParser"),
               baseFile = "", organism = "human"){

    new("UG", srcUrl = srcUrl, parser = parser, baseFile = baseFile,
        organism = organism)
}



# Building a data package containing location data for genes on
# chromosomes
chrLocPkgBuilder <- function(pkgName = "humanCHRLOC", pkgPath, version,
                             author, organism = "human",
                             url = getSrcUrl("gp", "human")){

    chrLoc <- getChroLocation(url, raw = TRUE)
    suffix <- gsub(".*_random", "Unconfident", chrLoc[,2])
    suffix[suffix != "Unconfident" | is.na(suffix)] <- "Confident"
    chrLoc <- cbind(chrLoc[,1], gsub("_random", "", chrLoc[,2]),
                    chrLoc[,3], paste(chrLoc[, 4], suffix, sep = "@"),
                    paste(chrLoc[, 5], suffix, sep = "@"))
#    chrLength <- findChrLength(organism)
    makeSrcInfo()
    createEmptyDPkg(pkgName, pkgPath)
    # Write the mappings between LL and chromosome number
    ll2Chr <- mergeRowByKey(chrLoc[, 1:2])
    env <- new.env(hash = TRUE, parent = NULL)
    multiassign(ll2Chr[,1], lapply(ll2Chr[,2], splitEntry), env)
    assign(paste(pkgName, "LOCUSID2CHR", sep = ""), env)
    save(list = paste(pkgName, "LOCUSID2CHR", sep = ""),
         file =  file.path(pkgPath, pkgName, "data",
         paste(pkgName, "LOCUSID2CHR.rda", sep = "")))
    writeManPage(pkgName, pkgPath, "LOCUSID2CHR",
                         src = get("LOCUSID2CHR", AnnInfo)$src)
    # Write the tamplates for man pages
    writeManPage(pkgName, pkgPath, "#CHRNUM#START", "human",
                         src = get("#CHRNUM#START", AnnInfo)$src)
    writeManPage(pkgName, pkgPath, "#CHRNUM#END", "human",
                         src = get("#CHRNUM#END", AnnInfo)$src)
    for(i in getChrNum(unique(chrLoc[,2]))){
        chr4Org <- chrLoc[chrLoc[,2] == i,]
        # Remove <NA>s introduced by the above manipulation
        chr4Org <- chr4Org[!is.na(chr4Org[,1]), ]
        # For -strand start should be end and end should be start
        chr4Org <- rbind(chr4Org[chr4Org[, 3] == "+", c(1, 4, 5)],
                         chr4Org[chr4Org[, 3] == "-", c(1, 5, 4)] )

        # Collapse values for LocusLink ids
        chr4Org <- mergeRowByKey(chr4Org)
        env <- new.env(hash = TRUE, parent = NULL)
        multiassign(chr4Org[, 1], lapply(chr4Org[, 2],
                                       twoStepSplit, asNumeric = TRUE), env)
        assign(paste(pkgName, i, "START", sep = ""), env)
        save(list = paste(pkgName, i, "START", sep = ""),
             file =  file.path(pkgPath, pkgName, "data",
             paste(pkgName, i, "START.rda", sep = "")))
        env <- new.env(hash = TRUE, parent = NULL)
        multiassign(chr4Org[, 1],
                        lapply(chr4Org[, 3], twoStepSplit,
                               asNumeric = TRUE), env)
        assign(paste(pkgName, i, "END", sep = ""), env)
        save(list = paste(pkgName, i, "END", sep = ""),
             file =  file.path(pkgPath, pkgName, "data",
             paste(pkgName, i, "END.rda", sep = "")))
        copySubstitute(file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#CHRNUM#START.Rd", sep = "")),
                           file.path(pkgPath, pkgName,
                           "man", paste(pkgName, i, "START.Rd",
                                        sep = "")), list(CHRNUM = i), "#")
        copySubstitute(file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#CHRNUM#END.Rd", sep = "")),
                           file.path(pkgPath, pkgName,
                           "man", paste(pkgName, i, "END.Rd",
                                        sep = "")), list(CHRNUM = i), "#")
    }
     # Remove man page templates
    file.remove(file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#CHRNUM#START.Rd", sep = "")),
                file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#CHRNUM#END.Rd", sep = "")))
    writeDescription(pkgName, pkgPath, version, author)
    # using pkgName() and the man page for the function
    writeFun (pkgPath, pkgName, organism = organism)
    writeMan4Fun(pkgName, pkgPath)
    # Write .First.lib
    writeZZZ(pkgPath, pkgName)
    # Write the quality control data
    # Quality control
    getDPStats("", pkgName, pkgPath)
    writeMan4QC(pkgName, pkgPath)
#    writeOrganism(pkgName, pkgPath, organism)
#    writeManPage(pkgName, pkgPath, "ORGANISM",
#                         src = get("ORGANISM", AnnInfo)$src)
}
# Chromosome number may have _random extensions for genes not too sure
# about their locations. Remove them
getChrNum <- function(chr){
    options(warn = -1)
    keep <- chr[!is.na(as.numeric(chr))]
    options(warn = 1)
    keep <- c(keep, chr[is.element(chr, c("X", "Y"))])
    return(keep)
}

# This function takes two data columns from a data file (e. g. two
# columns) and generates an environment object with
# values in one of the columns as keys and values in the other as
# values. A given row in either column may have multiple values
# separated by a separator(e. g. "a;b").
#
# Copyright 2002, J. Zhang, all rights reserved
#

cols2Env <- function(cols, colNames, keyColName = colNames[1], sep = ";"){

    # Environment to return
    dataEnv <- new.env(hash = TRUE, parent = NULL)

    # Force a matrix
    cols <- as.matrix(cols)
    colnames(cols) <- colNames

    if(ncol(cols) != 2){
        stop("The input data do not have two columns")
    }
    # Determine the column name
    valColName <- colNames[colNames != keyColName]
    # Match cols by finding matches for all the potential multiple
    # matches
    cols <- matchAll(cols, keyColName)
    colnames(cols) <- colNames
    #Get unduplicated row
    unDups <- cols[!duplicated(cols[,keyColName]),]
    colnames(unDups) <- colNames
    # Write unique keys and values
    multiassign(unDups[, keyColName],
                lapply(unDups[, valColName], splitEntry, sep = sep), dataEnv)
    # Process the duplications
    dups <- cols[duplicated(cols[,keyColName]),]
    colnames(dups) <- colNames
    if(nrow(dups) > 0){
        for(i in 1:nrow(dups)){
            assign(dups[i,keyColName], c(get(dups[i,
                         keyColName], dataEnv), unique(dups[i,valColName])),
                   dataEnv)
        }
    }

    return(dataEnv)
}

matchAll <- function(cols, keyColName){
    matched <- NULL

    for(i in 1:nrow(cols)){
        temp <- matchOneRow(cols[i,], keyColName)
        if(!is.null(temp)){
            if(is.null(matched)){
                matched <- temp
            }else{
                matched <- rbind(matched, temp)
            }
        }
    }
    return(matched)
}

matchOneRow <- function(cols, keyColName, sep = ";"){

    doMatch <- function(col1id){
        if(is.null(matched)){
            matched <<- cbind(col1id, col2)
        }else{
            matched <<- rbind(matched, cbind(col1id, col2))
        }
    }

    matched <- NULL
    # Key can not be any of these
    if(cols[keyColName] == "" || is.na(cols[keyColName])||
       is.null(cols[keyColName])){
        return(NULL)
    }

    col1 <- unlist(strsplit(cols[1], sep), use.names = FALSE)
    col2 <- unlist(strsplit(cols[2], sep), use.names = FALSE)
    # If no mapping is found for key, assign NA to key
    if(match(keyColName, names(cols)) == 1 && length(col2) == 0){
        return(cbind(col1, NA))
    }else if(match(keyColName, names(cols)) == 2 && length(col1) == 0){
        return(cbind(NA, col2))
    }
    # Otherwise, do a full matching
    sapply(col1, doMatch)

    return(matched)
}
# This function writes to an XML file using data contained by a file
# passed either as a file or R object. The file of the R object needs
# to be convertable to a matrix.
#
# Copyright 2002 J. Zhang, all rights reserved.
#

fileToXML <- function (targetName, outName, inName, idColName, colNames,
                       multColNames, typeColNames,  multSep = ";",
                       typeSep = ";", fileSep = "\t", header = FALSE,
                       isFile = TRUE, organism = "human",
                       version = "1.0.0"){

    if(!header && isFile){
        if(is.null(colNames) || is.na(colNames) || colNames == ""){
            stop("Parameter colNames has to be provided!")
        }
    }
    if(isFile){
        fileRead <- as.matrix(read.table(file = inName, header =
                                         header, sep = fileSep,
                                         as.is = TRUE, quote = "",
                                         comment.char = ""))
    }else{
        fileRead <- as.matrix(inName)
    }

    if(colNames != "" && !is.null(colNames) && !is.na(colNames)){
       colnames(fileRead) <- colNames
    }else{
        colNames <- colnames(fileRead)
    }
    # Create the our file and write the header
    file.create(outName)
    writeXMLHeader(outName, colNames, targetName, version, organism)
    # Write the first line of the Data node
    write(x = "<AnnBuilder:Data>", file = outName, append = TRUE)
    # Make starting tags for the Entry nodes
    entries <- paste("<AnnBuilder:Entry id=\"", fileRead[,idColName],
                       "\" describ = \"", idColName, "\">", sep = "")
    # Include Item nodes to Entry nodes using data from all the single
    # value columns
    for(i in setdiff(colNames, c(idColName, multColNames, typeColNames))){
        items <- paste("\t<AnnBuilder:Item name=\"", i,
                       "\" value=\"",
                       .formatStr(as.character(fileRead[,i])),
                       "\" />", sep = "")
        entries <- paste(entries, items, sep = "\n")
    }
    # Process the columns with multiple values separated by multSep
    if(!is.null(multColNames) && !is.null(multColNames) &&
       multColNames != ""){
        for(i in multColNames){
            doMultValue <- function(multCol, sep, name){
                splited <- unlist(strsplit(multCol, sep))
                paste("\t<AnnBuilder:Item name=\"", name, "\" value=\"",
                      .formatStr(as.character(splited)), "\" />", sep = "",
                      collapse = "\n")
            }
            items <- lapply(as.vector(fileRead[,i]), doMultValue, multSep, i)
            entries <- paste(entries, items, sep = "\n")
        }
    }
    # Process the columns with a type attribute
    if(!is.null(typeColNames) && !is.null(typeColNames) &&
       typeColNames != ""){
        for(i in typeColNames){
            doTypeValue <- function(typeCol, sep, name){
                if(typeCol != "" && toupper(typeCol) != "NA" &&
                   !is.na(typeCol) && !is.null(typeCol)){
                    splited <- unlist(lapply(typeCol, twoStepSplit))
                    splited <- cbind(splited, names(splited))
                    paste("\t<AnnBuilder:Item name=\"", name, "\" value=\"",
                          .formatStr(as.character(splited[,1])), "\"",
                          paste(" type=\"", splited[,2], "\"", sep = ""),
                          "/>", sep = "", collapse = "\n")
                }
            }
            items <- lapply(fileRead[,i], doTypeValue, typeSep, i)
            entries <- paste(entries, items, sep = "\n")
        }
    }
    # Collapse entries
    entries <- paste(entries, sep = "",
                     collapse = "\n</AnnBuilder:Entry>\n")
    # Add the rest in
    entries <- paste(entries, "</AnnBuilder:Entry>\n",
                     "</AnnBuilder:Data>\n",
                     "</AnnBuilder:Annotate>\n", sep = "", collapse = "\n")
    #Write to file
    write(entries, file = outName, append = TRUE)

}

.formatStr <- function(tobeDone){
    tobeDone <- gsub("&", "&amp;", tobeDone)
    tobeDone <- gsub("<", "&lt;", tobeDone)
    tobeDone <- gsub(">", "&gt;", tobeDone)
    tobeDone <- gsub("\"", "&quot;", tobeDone)
    tobeDone <- gsub("'", "&apos;", tobeDone)
    return(tobeDone)
}


















# Functions that return the built date for a given public source.
#
# Copyright 2002, J. Zhang, all rights reserved
#

getSrcBuilt <- function(src = "LL", organism = "human"){
    switch(toupper(src),
           "LL" = return(getLLBuilt()),
           "UG" = return(getUGBuilt(organism)),
           "GP" = return(getUCSCBuilt(organism)),
           "GO" = return(getGOBuilt()),
           "KEGG" = return(getKEGGBuilt()),
           "YG" = return(getYGBuilt()),
           "HG" = return(getHGBuilt()),
           stop(paste("Source", src, "is not supported.")))
}

getLLBuilt <- function(
        url = "http://www.ncbi.nlm.nih.gov/LocusLink/statistics.html"){

    got <- readURL(url)
    built <- gsub(".*LocusLink Statistics dated (.*)",
                  "\\1", got[grep("LocusLink Statistics dated", got)])
    if(is.na(built) || is.null(built) || built == ""){
        waringing("Built for LL is not valid!")
        return("N/A")
    }else{
        return(built)
    }
}

getUGBuilt <- function(organism,
                       url = "ftp://ftp.ncbi.nih.gov/repository/UniGene"){

    switch(toupper(organism),
           "HUMAN" = infoUrl <- paste(url, "/Hs.info", sep = ""),
           "MOUSE" = infoUrl <- paste(url, "/Mm.info", sep = ""),
           "RAT" = infoUrl <- paste(url, "/Rn.info", sep = ""),
           stop(paste("Organism", organism, "is not supported.")))

    got <- readURL(infoUrl)
    built <- gsub(".*(Build #[0-9]*).*", "\\1",
                  got[grep("Build #", got)[1]])
    if(is.na(built) || is.null(built) || built == ""){
        waringing("Built for UG is not valid!")
        return("N/A")
    }else{
        return(built)
    }
}

getUCSCBuilt <- function(organism){
    url <- getSrcUrl(src = "GP", organism = organism)
    switch(toupper(organism),
           MOUSE =  built <- gsub(".*goldenPath/mm(.*)/database.*",
                           "\\1", url),
           HUMAN = built <- gsub(".*goldenPath/(.*)/database.*", "\\1", url),
           RAT = built <- gsub(".*goldenPath/rn(.*)/database.*",
                           "\\1", url),
           stop(paste("Organism", organism, "is not supported")))

    if(is.na(built) || is.null(built) || built == ""){
        waringing("Built for UCSC is not valid!")
        return("N/A")
    }else{
        return(built)
    }
}

getGOBuilt <- function(){
   return(getGOUrl(xml = TRUE, dateOnly = TRUE))
}

getKEGGBuilt <- function(url = "http://www.genome.ad.jp/kegg/kegg2.html"){

    got <- readURL(url)
    aLine <- got[grep("KEGG Release", got)]
    built <- gsub(".*(Release [0-9.]* \\(.*\\)) .*", "\\1", aLine)
    if(is.na(built) || is.null(built) || built == ""){
        waringing("Built for KEGG is not valid!")
        return("N/A")
    }else{
        return(built)
    }
}

getYGBuilt <- function(){
    return(paste("Yeast Genome data are built at various time ",
                 "intervals. Sources used were downloaded ", date(), "\n"))
}

readURL <- function(url){
    con <- url(url)
    options(show.error.messages = FALSE)
    temp <- try(readLines(con))
    close(con)
    options(show.error.messages = TRUE)
    if(!inherits(temp, "try-error")){
        return(temp)
    }else{
        stop(paste("Can't read from url:", url))
    }
}

getHGBuilt <- function(){
    return("HomoloGene built date not available")
}











# Functions that return the correct URL that is going to be used to
# get annotation data from the desirable public data source.
#
# Copyright 2002, J. Zhang, all rights reserved.
#

getSrcUrl <- function(src = "LL", organism = "human", xml = TRUE,
                      dateOnly = FALSE){
    load(file.path(.path.package("AnnBuilder"), "data",
                    "sourceURLs.rda"), .GlobalEnv)
    switch(toupper(src),
           "LL" = return(getLLUrl()),
           "GP" = return(getUCSCUrl(organism)),
           "UG" = return(getUGUrl(organism)),
           "GO" = return(getGOUrl(xml, dateOnly)),
           "KEGG" = return(getKEGGUrl()),
           "GEO" = return(getGEOUrl()),
           "YG" = return(getYGUrl()),
           "HG" = return(getHGUrl()),
           "ALL" = return(getAllUrl(organism)),
           stop(paste("Source", src, "is not supported.",
                      "Has to be LL, GP, UG, GO, KEGG, ALL")))
}

getAllUrl <- function(organism){
    urls <- c(LL = getLLUrl(), GP = getUCSCUrl(organism),
              UG = getUGUrl(organism), GO = getGOUrl(), KEGG = getKEGGUrl(),
              YG = getYGUrl(), HG = getHGUrl())
    return(urls)
}

getYGUrl <- function(){
    return("http://www.yeastgenome.org/DownloadContents.shtml")
}

getLLUrl <- function(){
    return(sourceURLs[["LL"]])
}

getUCSCUrl <- function(organism){
    switch(toupper(organism),
           "HUMAN" = key <- "Annotation database",
           "MOUSE" = key <- "goldenPath/mm.*Annotation database",
           "RAT" = key <- "goldenPath/rn.*Annotation database",
           stop (paste("Organism", organism, "is not supported.")))

    con <- url(sourceURLs[["GP"]], open = "r")
    htmlPage <- readLines(con)
    close(con)
    aLine <- htmlPage[grep(key, htmlPage)[1]]
    got <- gsub(".*(goldenPath/.*/database/).*", "\\1", aLine)
    return(paste(gsub("(.*/).*.html", "\\1", sourceURLs[["GP"]]),
                 got, sep = ""))
}

getUGUrl <- function(organism){
    switch(toupper(organism),
           "HUMAN" = return(paste(sourceURLs[["UG"]], "/Hs.data.gz",
                            sep = "")),
           "MOUSE" = return(paste(sourceURLs[["UG"]], "/Mm.data.gz",
                            sep = "")),
           "RAT" = return(paste(sourceURLs[["UG"]], "/Rn.data.gz", sep = "")),
           stop(paste("Organism", organism, "is not supported.")))
}

# Figures out the url for the latest version of the GO data file
getGOUrl <- function(xml = TRUE, dateOnly = FALSE){
    # Read the html page to get the available data for different builds
    con <- url(sourceURLs[["GO"]], open = "r")
    srcHtml <-  readLines(con)
    close(con)
    builds <- srcHtml[grep("^<IMG SRC=.*>[0-9]*-[0-9]*-[0-9]*/.*", srcHtml)]
    # Loop through the builds to get the latest one
    builds <- sort(gsub(".*>([0-9]*-[0-9]*-[0-9]*)/.*", "\\1",  builds),
                   decreasing = TRUE)
    index <- 1
    # Get the name of the available file in the latest build
    repeat{
        con <- url(paste(sourceURLs[["GO"]], "/", builds[index],
                         sep = ""), open = "r")
        files <- readLines(con)
        close(con)
        if(xml){
            fileName <- gsub(".*(go_[0-9]*-termdb.xml.gz).*",
                    "\\1", files[grep(".*termdb.xml.gz", files)])
        }else{
            fileName <- gsub(".*(go_[0-9]*-termdb-data.gz).*", "\\1",
                             files[grep(".*termdb-dat", files)])
        }
        if(length(fileName) == 0){
            if(index < length(builds)){
                index <- index + 1
            }else{
                stop(paste("The target data file termdb.xxx.gz is not",
                           "available at", paste(sourceURLs[["GO"]],
                                                 "/", latest, sep = "")))
            }
        }else{
            if(dateOnly){
                return(builds[index])
            }else{
                return(paste(sourceURLs[["GO"]], "/",
                             builds[index], "/",fileName, sep = ""))
            }
        }
    }
}

getKEGGUrl <- function(){
    return(sourceURLs[["KEGG"]])
}

getGEOUrl <- function(){
    return(sourceURLs[["GEO"]])
}

getHGUrl <- function(){
    return(sourceURLs[["HG"]])
}
# This function builds a data package that maps internal HomoloGene
# ids of an organism to LocusLink ids, UniGene ids, percent identity
# of the alignment, and type of similarities of organisms of all
# pairwise best matches based on data from
# "ftp://ftp.ncbi.nih.gov/pub/HomoloGene/hmlg.ftp"
# pkgName - a character string for the name of the data package to be
# created.
# pkgPath - a character string for the path to which the data package
# to be created will be stored.
# version - a character string for the version number of the data
# package to be created.
# author - a list of two elements with the firest one being author - a
# character string for the name of the author of the data package and
# second one being maintainer - a character string for the email of
# the author.
#

homoPkgBuilder <- function(pkgName = "homology", pkgPath, version, author,
                 url = getSrcUrl("HG")){

    homoData <- procHomoData(url)
    makeSrcInfo()
    createEmptyDPkg(pkgName, pkgPath)
    # Write man page templates
    writeManPage(pkgName, pkgPath, "#HGID2LL#", "",
                         src = get("#HGID2LL#", AnnInfo)$src)
    writeManPage(pkgName, pkgPath, "#HGID2GB#", "",
                         src = get("#HGID2GB#", AnnInfo)$src)
    writeManPage(pkgName, pkgPath, "#HGID2PS#", "",
                         src = get("#HGID2PS#", AnnInfo)$src)
#    writeManPage(pkgName, pkgPath, "#HGID2TS#", "",
#                         src = get("#HGID2TS#", AnnInfo)$src)
    writeManPage(pkgName, pkgPath, "#HGID2HGID#", "",
                         src = get("#HGID2HGID#", AnnInfo)$src)
    # Do HGID to LL
    writeRdaNMan(mapIntID(homoData[, c(1, 2, 4, 5, 7, 8)]), pkgName,
                 pkgPath, "HGID2LL")
    # Do HGID to GB
    writeRdaNMan(mapIntID(homoData[, c(1, 2, 6, 5, 9, 8)]), pkgName,
                 pkgPath, "HGID2GB")
    # Do HGID to PS
    mapPS(homoData[, c(1, 2, 3, 4, 5, 7, 8, 10)], pkgName, pkgPath)
    # Do HGID to HGID
    writeRdaNMan(mapIntID(homoData[, c(1, 2, 5, 5, 8, 8)]), pkgName,
                 pkgPath, "HGID2HGID")
    # Remove man page templates
    file.remove(file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#HGID2LL#.Rd", sep = "")),
                file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#HGID2GB#.Rd", sep = "")),
                file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#HGID2PS#.Rd", sep = "")),
                file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#HGID2HGID#.Rd", sep = "")))
    writeDescription(pkgName, pkgPath, version, author)
    # using pkgName() and the man page for the function
    writeFun (pkgPath, pkgName, organism = "")
    writeMan4Fun(pkgName, pkgPath, "hg")
    # Write .First.lib
    writeZZZ(pkgPath, pkgName)
    # Write the quality control data
    # Quality control
    getDPStats("", pkgName, pkgPath)
    writeMan4QC(pkgName, pkgPath)
}

procHomoData <- function(url = getSrcUrl("HG")){
    # scan() does not work for some reason
    homoData <- as.matrix(read.table(url, header = FALSE, sep = "|",
                           quote = "", as.is = TRUE, strip.white = TRUE,
                           comment.char = ""))
    # remove leading LL from LL ids
    homoData <- matrix(gsub("^LL\\.", "", homoData),
                       ncol = ncol(homoData), byrow = FALSE)
    # make "" NA
    homoData[homoData == ""] <- NA
    # remove leading white spaces
    homoData <- matrix(gsub(" *", "", homoData),
                       ncol = ncol(homoData), byrow = FALSE)
    return(homoData)
}

getLL2IntID <- function(homoData, organism = ""){
    switch(tolower(organism),
           human = org <- "9606",
           mouse = org <- "10090",
           rat = org <- "10116",
           org <- NULL)
    ll2ID <- homoData[, c(4, 5, 1)]
    ll2ID <- rbind(ll2ID, homoData[,c(7, 8, 2)])
    if(!is.null(org)){
        ll2ID <- ll2ID[ll2ID[,3] == org, ]
    }
    ll2ID <- ll2ID[, 1:2]
    # Drop entries without LL
    ll2ID <- ll2ID[ll2ID[, 1] != "NA", ]
    # Get duplicated LL
    if(nrow(ll2ID) > length(unique(ll2ID[,1]))){
        ll2ID <- mergeRowByKey(ll2ID)
    }
    colnames(ll2ID) <- c("LOCUSID", "HGID")

    return(ll2ID)
}

getIntIDMapping <- function(homoData){
    dataEnv <- new.env(hash = TRUE, parent = NULL)
    # Drop entries with empty Internal id or NA for Mapping
    homoData <- homoData[homoData[, 2] != "NA" & !is.na(homoData[,2]) &
                         homoData[, 3] != "NA" & !is.na(homoData[,3]), ]
    # Remove duplication
    homoData <- homoData[!duplicated(homoData),]
    # If there is only one mapping found
    if(is.null(nrow(homoData))){
        homoData <- matrix(homoData, nrow = 1)
    }
    homoData <- cbind(homoData[,2], paste(homoData[,3],
                                      mapOrgs(homoData[,1]), sep = "@"))
    homoData <- mergeRowByKey(homoData, 1)
    if(length(homoData) > 0){
        multiassign(homoData[,1], lapply(homoData[,2], twoStepSplit),
                    env = dataEnv)
    }
    return(dataEnv)
}

mapIntID <- function(homoData){
#    if(combName){
#         IDMapping <- homoData[, c(1, 2, 3, 5, 6, 8)]
#         IDMapping <- rbind(IDMapping, homoData[,c(2, 1, 3, 7, 4, 8)])
#         IDMapping <- cbind(IDMapping[,1],
#                          paste(IDMapping[,2], IDMapping[,5], IDMapping[,3],
#                                  sep = ";"), IDMapping[, c(4, 6)])
#         return(IDMapping)
#    }else{
         IDMapping <- homoData[, c(1, 2, 4, 5)]
         IDMapping <- rbind(IDMapping, homoData[,c(2, 1, 6, 3)])
         return(IDMapping)
#    }
}

mapOrgs <- function(vect){
    for(i in names(orgNameNCode)){
        vect <- gsub(paste("^", i, "$", sep = ""),
                     orgNameNCode[i], vect)
    }
    return(vect)
}

writeRdaNMan <- function(homoData, pkgName, pkgPath, what){
    load(file.path(.path.package("AnnBuilder"), "data", "orgNameNCode.rda"),
         env = .GlobalEnv)
    for(i in names(orgNameNCode)){
        tempList <- list()
        tempData <- homoData[homoData[,1] == i, ]
        if(nrow(tempData) > 0){
            assign(paste(pkgName, i, what, sep = ""),
                   getIntIDMapping(tempData[, 2:4]))
            save(list = paste(pkgName, i, what, sep = ""),
                 file =  file.path(pkgPath, pkgName, "data",
                 paste(pkgName, i, what, ".rda", sep = "")))
            tempList[[what]] <-  paste(i, what, sep = "")
            tempList[["ORGANISM"]] <- orgNameNCode[[i]]
            copySubstitute(file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#", what, "#.Rd", sep = "")),
                           file.path(pkgPath, pkgName,
                           "man", paste(pkgName, i, what, ".Rd",
                                        sep = "")), tempList, "#")
        }
    }
}

mapPS <- function(homoData, pkgName, pkgPath){
    psMapping <- homoData[, c(1, 2, 3, 5, 6, 8)]
    psMapping <- rbind(psMapping, homoData[,c(2, 1, 3, 7, 4, 8)])
    homoPSs <- apply(psMapping, 1, getHomoPS)

    for(i in names(orgNameNCode)){
        mergeByHGID <- function(){
            mergeOne <- function(hgid){
                PSs <- tempPSs[tempHGIDs == hgid & !is.na(tempHGIDs)]
                noDup[[hgid]] <<- PSs
            }
            noDup <- tempPSs[!duplicated(tempHGIDs) & !is.na(tempHGIDs)]
            names(noDup) <- tempHGIDs[!duplicated(tempHGIDs) &
                                      !is.na(tempHGIDs)]
            trash <- sapply(unique(tempHGIDs[duplicated(tempHGIDs) &
                                            !is.na(tempHGIDs)]), mergeOne)
            return(noDup)
        }
        tempHGIDs <- psMapping[psMapping[,1] == i , 4]
        tempPSs <- homoPSs[psMapping[,1] == i]

        homoPSList <- mergeByHGID()

        if(length(homoPSList) > 0){
            tempEnv <- new.env(hash = TRUE, parent = NULL)
            multiassign(names(homoPSList), homoPSList, tempEnv)
            assign(paste(pkgName, i, "HGID2PS", sep = ""), tempEnv)
            save(list = paste(pkgName, i, "HGID2PS", sep = ""),
                 file =  file.path(pkgPath, pkgName, "data",
                 paste(pkgName, i, "HGID2PS", ".rda", sep = "")))
            tempList[["HGID2PS"]] <-  paste(i, "HGID2PS", sep = "")
            tempList[["ORGANISM"]] <- orgNameNCode[[i]]
            copySubstitute(file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#HGID2PS#.Rd", sep = "")),
                           file.path(pkgPath, pkgName,
                                   "man", paste(pkgName, i, "HGID2PS.Rd",
                                              sep = "")), tempList, "#")
        }
    }
}

getHomoPS <- function(entries){
    if(length(entries) < 6){
        stop("Incorrect argument length. Must be at least 6")
    }
    if(any(entries[3] == c("B", "b"))){
        return(new("homoPS", psOrg = mapOrgs(entries[2]), psLL = entries[5],
                   psType = entries[3], ps = as.numeric(entries[6])))
    }else{
        return(new("homoPS", psOrg = mapOrgs(entries[2]), psLL = entries[5],
                   psType = entries[3], psURL = entries[6]))
    }
}
# A generic class that reads or downloads data from public data repositories.
# srcUrl -  the url for the cgi script that initiates a query against
#            the databse. The value at the time of coding is:
#            "http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?"
#
    setClass("pubRepo", representation(srcUrl = "character",
                                       parser = "character",
                                       baseFile = "character"))

    # Set the get methods
    if(!isGeneric("srcUrl")){
        setGeneric("srcUrl",
                   function(object) standardGeneric("srcUrl"))
    }
    setMethod("srcUrl", "pubRepo",
              function(object) object@srcUrl)

    # Define the replace methods
    if(!isGeneric("srcUrl<-")){
        setGeneric("srcUrl<-", function(object, value)
                   standardGeneric("srcUrl<-"))
    }
    setReplaceMethod("srcUrl", "pubRepo", function(object, value){
                  object@scrUrl <- value; object})
    if(!isGeneric("baseFile")){
        setGeneric("baseFile",
                   function(object) standardGeneric("baseFile"))
    }
    setMethod("baseFile", "pubRepo",
              function(object) object@baseFile)

    # Define the replace methods
    if(!isGeneric("baseFile<-")){
        setGeneric("baseFile<-", function(object, value)
                   standardGeneric("baseFile<-"))
    }
    setReplaceMethod("baseFile", "pubRepo", function(object, value){
                  object@baseFile <- value; object})
    if(!isGeneric("parser")){
        setGeneric("parser",
                   function(object) standardGeneric("parser"))
    }
    setMethod("parser", "pubRepo",
              function(object) object@parser)
    if(!isGeneric("parser<-")){
        setGeneric("parser<-", function(object, value)
                   standardGeneric("parser<-"))
    }
    setReplaceMethod("parser", "pubRepo", function(object, value){
                  object@parser <- value; object})
    # Defines functions
    if(!isGeneric("readData")){
        setGeneric("readData",
                   function(object, ...)
                   standardGeneric("readData"))
    }
    setMethod("readData", "pubRepo",
              function(object, ...){
                  conn <- url(srcUrl(object))
                  temp <- readLines(conn)
                  close(conn)
                  return(temp)})
    if(!isGeneric("downloadData")){
        setGeneric("downloadData",
                   function(object, dist)
                   standardGeneric("downloadData"))
    }
    setMethod("downloadData", "pubRepo",
              function(object, dist)
                  return(loadFromUrl(srcUrl(object), dist)))
    if(!isGeneric("parseData")){
        setGeneric("parseData", function(object, ...)
                   standardGeneric("parseData"))
    }
    setMethod("parseData", "pubRepo", function(object, sep = "\t",
                                               ncol = 2, fromWeb = TRUE,
                                               mergeKey = TRUE){
        if(fromWeb && .Platform$OS.type == "unix"){
            srcData <- downloadData(object, "")
        }else{
            srcData <- srcUrl(object)
        }
        tempOut <- file.path(.path.package("AnnBuilder"), "temp",
                             basename(tempfile("tempOut")))
        obtained <- matrix(scan(fileMuncher(tempOut, baseFile(object),
                                                srcData, parser(object)),
                               what = "", sep = sep, quote = "",
                               quiet = TRUE), ncol = ncol, byrow = TRUE)
        if(fromWeb && .Platform$OS.type == "unix"){
            unlink(srcData)
        }
        unlink(tempOut)
        if(nrow(obtained) <= 1 || !mergeKey){
            return(obtained)
        }else{
            return(mergeRowByKey(obtained))
        }})

# Sub class of pubRepo that reads/downloads data from GEO
# srcUrl -  the url for the cgi script that initiates a query against
#            the databse. The value at the time of coding is:
#            "http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?"
    setClass("GEO", contains = "pubRepo")
    # Redefines readData
    setMethod("readData", "GEO",
              function(object, GEOAccNum)
              return(queryGEO(object, GEOAccNum)))


# Sub class of pubRepo that reads/downloads data from yeast genomic
# data.
# srcUrl - the base url for the ftp site for downloading. The value at
#          the time of coding is:
#          "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/"
     setClass("YG", contains = "pubRepo")
     # Redefines readData
     setMethod("readData", "YG",
              function(object, extenName, cols2Keep, sep)
              return(getYeastData(srcUrl(object), extenName,
                                      cols2Keep, sep)))
# Sub class of pubRepo that processes GO data

    setClass("GO", contains = "pubRepo")
    setMethod("readData", "GO",
              function(object, xml = TRUE, fromWeb = TRUE){
                  if(fromWeb){
                      fileName <- loadFromUrl(srcUrl(object), "")
                  }else{
                      fileName <- srcUrl(object)
                  }
                  if(xml){
                      goData <- GOXMLParser(fileName)
                  }else{
                      goData <- readLines(fileName)
                  }
                  unlink(fileName)
                  return(goData)})

    setClass("LL", contains = "pubRepo")

    setClass("UG", contains = "pubRepo",
             representation(organism = "character"))
    if(!isGeneric("organism")){
        setGeneric("organism",
                   function(object) standardGeneric("organism"))
    }
    setMethod("organism", "UG",
              function(object) object@organism)
    if(!isGeneric("organism<-")){
        setGeneric("organism<-", function(object, value)
                   standardGeneric("organism<-"))
    }
    setReplaceMethod("organism", "UG", function(object, value){
                  object@organism <- value; object})

  setClass("KEGG", contains = "UG")
    # Defines specific functions
    if(!isGeneric("findIDNPath")){
        setGeneric("findIDNPath", function(object)
                   standardGeneric("findIDNPath"))
    }
    # Get a named vector that labels KEGG pathway names with pathway ids.
    setMethod("findIDNPath", "KEGG", function(object)
              return(getKEGGIDNName(srcUrl(object))))
    if(!isGeneric("mapLL2ECNPName")){
        setGeneric("mapLL2ECNPName", function(object)
                   standardGeneric("mapLL2ECNPName"))
    }
    setMethod("mapLL2ECNPName", "KEGG", function(object)
              return(getLLPathMap(srcUrl(object), findIDNPath(object),
                                  organism(object))))

    setClass("GP", contains = "UG")
    # Defines specific functions
    if(!isGeneric("getStrand")){
        setGeneric("getStrand", function(object, ...)
                   standardGeneric("getStrand"))
    }
    # Parse refLink and refGene data files to get chromosomal location data
    setMethod("getStrand", "GP", function(object, fromWeb = TRUE)
              return(getChroLocation(srcUrl(object), fromWeb = fromWeb)))

    ### homoPS objects are used by homoPkgBuilder to represent data
    #### for percent similarity
    setClass("homoPS", representation(psOrg = "character",
                                      psLL = "character",
                                      psType = "character",
                                      ps = "numeric",
                                      psURL = "character"))

    # Set the get methods
    if(!isGeneric("psOrg")){
        setGeneric("psOrg",
                   function(object) standardGeneric("psOrg"))
    }
    setMethod("psOrg", "homoPS",
              function(object) object@psOrg)
    if(!isGeneric("psLL")){
        setGeneric("psLL",
                   function(object) standardGeneric("psLL"))
    }
    setMethod("psLL", "homoPS",
              function(object) object@psLL)
    if(!isGeneric("psType")){
        setGeneric("psType",
                   function(object) standardGeneric("psType"))
    }
    setMethod("psType", "homoPS",
              function(object) object@psType)
    if(!isGeneric("ps")){
        setGeneric("ps",
                   function(object) standardGeneric("ps"))
    }
    setMethod("ps", "homoPS",
              function(object) object@ps)
    if(!isGeneric("psURL")){
        setGeneric("psURL",
                   function(object) standardGeneric("psURL"))
    }
    setMethod("psURL", "homoPS",
              function(object) object@psURL)
# This function tries to download a file from a given target source
# and and unzip the file if needed.
#
# Copyright 2002, J. Zhang. All rights reserved.
#
loadFromUrl <- function(srcUrl, destDir = ""){

    if(destDir == ""){
        destDir <- file.path(.path.package("AnnBuilder"), "temp")
    }

    fileName <- file.path(destDir, paste(basename(tempfile()),
                             gsub("*.*/(*.*)", "\\1", srcUrl), sep = ""))
    # Make sure writing is permited
    options(show.error.messages = FALSE)
    tryMe <- try(file(fileName, "w"))
    options(show.error.messages = TRUE)
    if(inherits(tryMe, "try-error")){
        stop(paste("You do not have write permission to",
                   "the direcory specified!"))
    }
    options(show.error.messages = FALSE)
    tryMe <- try(download.file(srcUrl, fileName,
                      method = "internal", quiet = TRUE))
    options(show.error.messages = TRUE)
    if(inherits(tryMe, "try-error")){
        stop(paste("URL", srcUrl, "is incorrect or the target",
                   "site is not responding!"))
    }

    # Unzip if necessary
    if(regexpr("*.gz", fileName) > 0){
        if(file.exists(gsub("(*.*).gz", "\\1", fileName)))
            unlink(gsub("(*.*).gz", "\\1", fileName))
        unzipFile(fileName, destDir, TRUE)
    }else{
        if(regexpr("*.zip", fileName) > 0){
            unzipFile(fileName, destDir, FALSE)
        }
    }

    return (gsub("(.gz)|(.zip)", "\\", fileName))
}

validateUrl <- function(srcUrl){
    options(show.error.messages = FALSE)
    con <- try(url(srcUrl, "r"))
    options(show.error.messages = TRUE)
    if(inherits(con, "try-error")){
        stop(paste("URL", srcUrl, "is incorrect or the target",
                   "site is not responding!"))
    }else{
        close(con)
    }
}

unzipFile <- function(fileName, where =
                      file.path(.path.package("AnnBuilder"), "data"),
                      isgz = FALSE){
    curDir <- getwd()
    setwd(where)
    if(.Platform$OS.type == "unix"){
        if(isgz){
            system(paste("gunzip", fileName))
        }else{
            if(!is.null(getOption("unzip"))){
                system(paste(getOption("unzip"), basename(fileName)))
            }else{
                stop("Can not find unzip in the path")
            }
        }
    }else if(.Platform$OS.type == "windows"){
        if(isgz){
            con <- gzfile(fileName, open = "r")
            options(show.error.messages = FALSE)
            data <- try(readLines(fileName, n = 100))
            close(con)
            options(show.error.messages = TRUE)
            if(!inherits(data, "try-error")){
                write(data, file = gsub("\\.gz", "", fileName))
            }else{
                stop(paste("Error reading", fileName))
            }
        }else{
            zip.unpack(fileName, getwd())
        }
        unlink(fileName)
    }
    setwd(curDir)
}












# This function creates an R environment in .GlobalEnv to hold source
# data information that will be used later by other functions. The
# environment object contains key-value pairs with
# keys being data names such as LOCUSID, ACCNUM and values being a
# list containing a short (sort diescription), long (long
# description), and source (data source) element.
#
# Copyright 2002 J. Zhang, all right reserved.
#

makeSrcInfo <- function(srcFile = ""){
    path <- .path.package("AnnBuilder")

    if(is.null(srcFile) || is.na(srcFile) || srcFile == ""){
        srcFile <- file.path(path, "data", "AnnInfo")
    }

    info <- matrix(scan(srcFile, what = "", sep = "\t", quiet = TRUE),
                   ncol = 5, byrow = TRUE)
    temp<- new.env(hash = TRUE, parent = NULL)
    for(i in 1:nrow(info)){
        assign(info[i,][1], list(short = info[i,][2],
                                 long = info[i,][3],
                                 src = info[i,][4],
                                 pbased = info[i,][5]),
               env = temp)
    }
    assign("AnnInfo", temp, env = .GlobalEnv)
}

getAllSrc <- function(){
    return(c("LL", "UG", "GO", "KEGG", "GP", "YG", "GEO"))
}
map2LL <- function(pkgPath, organism, version, author,
              url = "ftp://ftp.ncbi.nih.gov/refseq/LocusLink/"){
    # Write AnnInfo to .GlobalEnv to make Annotation information available
    makeSrcInfo()
    pkgName <- paste(organism, "LLMappings", sep = "")
    # Create a data package with no data
    createEmptyDPkg(pkgName, pkgPath, force = TRUE)
    # Write the template for man pages
    writeManPage(pkgName, pkgPath, "#REPLACEME#", "",
                         src = get("#REPLACEME#", AnnInfo)$src)
    # Read data with mappings between LL and UG
    ug <-  read.table(paste(url, getExten("ug"), sep = ""), sep = "\t",
                        header = FALSE, as.is = TRUE)
    # Retain values for the organism
    ug <- ug[grep(getOrgName(organism, "short"), ug[,2]), ]
    ll2UG <- mergeRowByKey(ug, keyCol = 1)
    saveData2Env(ll2UG, splitEntry, pkgName, pkgPath, "LL2UG")
    copySubstitute(file.path(pkgPath, pkgName, "man",
                              paste(pkgName, "#REPLACEME#.Rd", sep = "")),
                           file.path(pkgPath, pkgName,
                           "man", paste(pkgName, "LL2UG.Rd", sep = "")),
                   list(REPLACEME = "LL2UG", KEY = "LocusLink id",
                        VALUE = "UniGene id",
                        OTHERS = ""), "#")
    ug2LL <- mergeRowByKey(ug, keyCol = 2)
    saveData2Env(ug2LL, splitEntry, pkgName, pkgPath, "UG2LL")
    copySubstitute(file.path(pkgPath, pkgName, "man",
                              paste(pkgName, "#REPLACEME#.Rd", sep = "")),
                           file.path(pkgPath, pkgName,
                           "man", paste(pkgName, "UG2LL.Rd", sep = "")),
                   list(REPLACEME = "UG2LL", VALUE = "LocusLink id",
                        KEY = "UniGene id",
                        OTHERS = ""), "#")
    # Read data with mappings between LL and GO. This is only good for
    # mappings with no evidence code
    # go <- read.table(paste(url, getExten("go"), sep = ""), sep = "\t",
    #                    header = FALSE, as.is = TRUE)
    # Retain those for the organism
    # go <- go[is.element(go[,1], ug[,1]),]
    # Retain those for the organism
    # ll2GO <- mergeRowByKey(go, keyCol = 1)
    # saveData2Env(ll2GO, splitEntry, pkgName, pkgPath, "LL2GO")
    # copySubstitute(file.path(pkgPath, pkgName, "man",
    #                       paste(pkgName, "#REPLACEME#.Rd", sep = "")),
    #                       file.path(pkgPath, pkgName,
    #                       "man", paste(pkgName, "LL2GO.Rd", sep = "")),
    #               list(REPLACEME = "LL2GO", KEY = "LocusLink id",
    #                    VALUE = "Gene Ontology id"), "#")
    # go2LL <- mergeRowByKey(go, keyCol = 2)
    # saveData2Env(go2LL, splitEntry, pkgName, pkgPath, "GO2LL")
    # copySubstitute(file.path(pkgPath, pkgName, "man",
    #                       paste(pkgName, "#REPLACEME#.Rd", sep = "")),
    #                       file.path(pkgPath, pkgName,
    #                       "man", paste(pkgName, "GO2LL.Rd", sep = "")),
    #               list(REPLACEME = "GO2LL", VALUE = "LocusLink id",
    #               KEY = "Gene Ontology id"), "#")

    # Parse LL_tmpl data to get the mappings between LL and PMID
    oriParser <- file.path(.path.package("AnnBuilder"), "data",
                            "llMappingParser")
    newParser <- file.path(.path.package("AnnBuilder"), "temp",
                           "llMappingParser")
    # Write organism to the parser to be used
    copySubstitute(oriParser, newParser,
                   list(REPLACEME = getOrgName(organism, "scientific")), "#")
    ll <- LL(srcUrl = paste(url, getExten("ll"), sep = ""),
                       parser = newParser)
    # llMapping contains three columns - locusid, pmid, and go
    options(show.error.messages = FALSE)
    llMapping <- try(parseData(ll, ncol = 3, fromWeb = TRUE))
    options(show.error.messages = TRUE)
    if(inherits(llMapping, "try-error")){
        stop(paste("Failed to get or parse LocusLink data becaus of:\n\n",
                   llMapping))
    }
    # Do pmid first
    saveData2Env(llMapping[, c(1, 2)], pkgName = pkgName,
                 pkgPath = pkgPath, envName = "LL2PMID")
    copySubstitute(file.path(pkgPath, pkgName, "man",
                             paste(pkgName, "#REPLACEME#.Rd", sep = "")),
                   file.path(pkgPath, pkgName, "man",
                             paste(pkgName, "LL2PMID.Rd", sep = "")),
                   list(REPLACEME = "LL2PMID",
                        KEY = "LocusLink id", VALUE = "PubMed id",
                        OTHERS = ""), "#")
    # Now do the reverse mapping for PMID
    pmid2LL <- getReverseMapping(llMapping[, c(1,2)])
    saveData2Env(pmid2LL, pkgName = pkgName,
                         pkgPath = pkgPath, envName = "PMID2LL")
    copySubstitute(file.path(pkgPath, pkgName, "man",
                             paste(pkgName, "#REPLACEME#.Rd", sep = "")),
                   file.path(pkgPath, pkgName, "man",
                             paste(pkgName, "PMID2LL.Rd", sep = "")),
                   list(REPLACEME = "PMID2LL",
                        KEY = "PubMed id", VALUE = "LocusLink id",
                        OTHERS = ""), "#")
    # Do GO ids
    saveData2Env(llMapping[, c(1, 3)], pkgName = pkgName, fun = twoStepSplit,
                 pkgPath = pkgPath, envName = "LL2GO")
    other <- paste(" Names of values are evidence code. Detailed ",
                   "descriptions of evidence code are available at ",
                   "\\\\url{www.geneontology.org/doc/GO.evidence.html}.",
                   sep  = "")
    copySubstitute(file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#REPLACEME#.Rd", sep = "")),
                           file.path(pkgPath, pkgName,
                           "man", paste(pkgName, "LL2GO.Rd", sep = "")),
                   list(REPLACEME = "LL2GO", KEY = "LocusLink id",
                        VALUE = "Gene Ontology id",
                        OTHERS = other), "#")
    # Do the reverse mapping for go
    go2LL <- reverseMap4GO(llMapping[, c(1, 3)], sep = ";")
    saveData2Env(go2LL, twoStepSplit, pkgName, pkgPath, "GO2LL")
    copySubstitute(file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#REPLACEME#.Rd", sep = "")),
                           file.path(pkgPath, pkgName,
                           "man", paste(pkgName, "GO2LL.Rd", sep = "")),
                   list(REPLACEME = "GO2LL", VALUE = "LocusLink id",
                        KEY = "Gene Ontology id",
                        OTHERS = other), "#")
    # Write man pages and so on
    writeDescription(pkgName, pkgPath, version, author, dataSrc = "NCBI")
    writeFun (pkgPath, pkgName, organism = "")
    writeMan4Fun(pkgName, pkgPath)
    # Write .First.lib
    writeZZZ(pkgPath, pkgName)
    # Write the quality control data
    # Quality control
    getDPStats("", pkgName, pkgPath)
    writeMan4QC(pkgName, pkgPath)
    file.remove(file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#REPLACEME#.Rd", sep = "")))

    return(invisible())
}

getUrl4Org <- function(organism){
    switch(toupper(organism),
           HUMAN = return(),
           MOUSE = return(),
           RAT = return(),
           stop("Organism not supported"))
}

getOrgName <- function(organism, what = c("scientific", "short")){
    what <- tolower(match.arg(what))
    names <- cbind(c("human", "mouse", "rat", "fly"),
                   c("Homo sapiens", "Mus musculus", "Rattus norvegicus",
                     "Drosophila melanogaster"), c("Hs", "Mm", "Rn", "Dm"))
    if(what == "scientific"){
        return(names[names[,1] == organism, 2])
    }else{
        return(names[names[,1] == organism, 3])
    }
}

getFullIDName <- function(ID){
    switch(toupper(ID),
           LL = return("LocusLink id"),
           UG = return("UniGene id"),
           GO = return("Gene Ontology id"),
           PMID = return("PubMid id"),
           stop("Unknown id name"))
}

# Just do not what to throw the code away
.keepAround <- function(){
    # Read data with mappings between LL and GG
    go <- read.table(paste(url, getExten("go"), sep = ""), sep = "\t",
                        header = FALSE, as.is = TRUE)
    ll2GO <- mergeRowByKey(go, keyCol = 1)
    saveColSepData(ll2GO, pkgName, pkgPath, "LL2GO")
    copySubstitute(file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#REPLACEME#.Rd", sep = "")),
                           file.path(pkgPath, pkgName,
                           "man", paste(pkgName, "LL2GO.Rd", sep = "")),
                   list(REPLACEME = "LL2GO", KEY = "LocusLink id",
                        VALUE = "Gene Ontology id"), "#")
    go2LL <- mergeRowByKey(go, keyCol = 2)
    saveColSepData(go2LL, pkgName, pkgPath, "GO2LL")
    copySubstitute(file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#REPLACEME#.Rd", sep = "")),
                           file.path(pkgPath, pkgName,
                           "man", paste(pkgName, "GO2LL.Rd", sep = "")),
                   list(REPLACEME = "GO2LL", VALUE = "LocusLink id",
                        KEY = "Gene Ontology id"), "#")
    # Read data with mappings between LL and RefSeq
    ref <-  read.table(paste(url, getExten("ref"), sep = ""), sep = "\t",
                        header = FALSE, as.is = TRUE)
    ll2Ref <- mergeRowByKey(cbind(ref[,1], gsub("(^.*)\\..*", "\\1",
                                                   ref[,2])), keyCol = 1)
    saveColSepData(ll2Ref, pkgName, pkgPath, "LL2REF")
    copySubstitute(file.path(pkgPath, pkgName, "man",
                              paste(pkgName, "#REPLACEME#.Rd", sep = "")),
                           file.path(pkgPath, pkgName,
                           "man", paste(pkgName, "LL2REF.Rd", sep = "")),
                   list(REPLACEME = "LL2REF", KEY = "LocusLink id",
                        VALUE = "RefSeq id"), "#")
    ref2LL <- mergeRowByKey(cbind(ref[,1], gsub("(^.*)\\..*", "\\1",
                                                   ref[,2])), keyCol = 2)
    saveColSepData(ref2LL, pkgName, pkgPath, "REF2LL")
    copySubstitute(file.path(pkgPath, pkgName, "man",
                              paste(pkgName, "#REPLACEME#.Rd", sep = "")),
                           file.path(pkgPath, pkgName,
                           "man", paste(pkgName, "REF2LL.Rd", sep = "")),
                   list(REPLACEME = "REF2LL", VALUE = "LocusLink id",
                        KEY = "RefSeq id"), "#")
    # Read data with mappings between LL and UG
    ug <-  read.table(paste(url, getExten("ug"), sep = ""), sep = "\t",
                        header = FALSE, as.is = TRUE)
    ll2UG <- mergeRowByKey(ug, keyCol = 1)
    saveColSepData(ll2UG, pkgName, pkgPath, "LL2UG")
    copySubstitute(file.path(pkgPath, pkgName, "man",
                              paste(pkgName, "#REPLACEME#.Rd", sep = "")),
                           file.path(pkgPath, pkgName,
                           "man", paste(pkgName, "LL2UG.Rd", sep = "")),
                   list(REPLACEME = "LL2UG", KEY = "LocusLink id",
                        VALUE = "UniGene id"), "#")
    ug2LL <- mergeRowByKey(ug, keyCol = 2)
    saveColSepData(ug2LL, pkgName, pkgPath, "UG2LL")
    copySubstitute(file.path(pkgPath, pkgName, "man",
                              paste(pkgName, "#REPLACEME#.Rd", sep = "")),
                           file.path(pkgPath, pkgName,
                           "man", paste(pkgName, "UG2LL.Rd", sep = "")),
                   list(REPLACEME = "UG2LL", VALUE = "LocusLink id",
                        KEY = "UniGene id"), "#")
    # Write man pages and so on
    writeDescription(pkgName, pkgPath, version, author)
    writeFun (pkgPath, pkgName, organism = "")
    writeMan4Fun(pkgName, pkgPath)
    # Write .First.lib
    writeZZZ(pkgPath, pkgName)
    # Write the quality control data
    # Quality control
    getDPStats("", pkgName, pkgPath)
    writeMan4QC(pkgName, pkgPath)
    file.remove(file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "#REPLACEME#.Rd", sep = "")))
}

getExten <- function(what){

    switch(toupper(what),
           "GO" = return("loc2go"),
           "REF" = return("loc2ref"),
           "UG" = return("loc2UG"),
           "LL" = return("LL_tmpl.gz"),
           stop("Source type not supported"))
}

saveData2Env <- function(data, fun = splitEntry, pkgName, pkgPath, envName){
    env <- new.env(hash = TRUE, parent = NULL)
    if(nrow(data) != 0){
        multiassign(data[,1], lapply(data[,2], fun), env)
    }
    assign(paste(pkgName, envName, sep = ""), env)
    save(list = paste(pkgName, envName, sep = ""),
         file = file.path(pkgPath, pkgName, "data",
         paste(pkgName, envName, ".rda", sep = "")))
}


getReverseMapping <- function(data, sep = ";"){
    if(nrow(data) == 0){
        return(data)
    }
    seped <- sapply(data[,2], splitEntry)
    reps <- sapply(seped, length)
    data <- cbind(rep(data[,1], reps), unlist(seped, use.names = FALSE))
    data <- mergeRowByKey(data, keyCol = 2)
    return(data)
}

reverseMap4GO <- function(data, sep = ";"){
    appendEC <- function(entry){
        ids <- unlist(strsplit(entry[2], ";"))
        return(paste(paste(ids, gsub("GO:[0-9]*", "", entry[1]),
                           sep = ""), sep = "", collapse = ";"))
    }
    if(nrow(data) == 0){
        return(data)
    }
    # Get reverse mappings first
    reversed <- getReverseMapping(data, sep)
    # Append "@evidence code" to the end of each mapped values
    reversed[,2] <- apply(reversed, 1, appendEC)

    # Remove evidence code from GO ids
    reversed[,1] <- gsub("@.*", "", reversed[,1])

    return(mergeRowByKey(reversed, keyCol = 1))
}
mapGO2AllProbe <- function(go2Probe, goData, goid = "", sep = ";",
                           all = TRUE){
    if(all){
        # If all, get all the unigue go ids
        goid <- unique(goData[,1])
    }
    # Gets all the probes
    getAllProbe <- function(id){
        temp <- getOffspringNodes(id, goData)
        found <- unique(unlist(strsplit(go2Probe[is.element(go2Probe[, 1],
                              temp), 2], sep), use.names = FALSE))
        # Add mapping by the id itself in
        found <- unique(c(found,
                        unlist(strsplit(go2Probe[go2Probe[,1] == id, 2],
                                        sep), use.names = FALSE)))
        if(is.null(found) || is.na(found) || found == "" ||
           found == "NA"){
            return(NA)
        }else{
            # Collapse the values
            return(paste(found, sep = "", collapse = sep))
        }
    }
    go2All <- sapply(goid, getAllProbe)
#    go2All <- cbind(names(go2All), go2All)
#    colnames(go2All) <- c("GO", "GO2ALLPROBE")
    return(go2All)
}
# This function resolves the one to may relationship beween ids
# contained in a file in one column and the values in another
# column. Duplicating values for the same id will be merged as one
# value separated by ";".
#
# Copyright 2002, J. Zhang. All rights reserved.
#

mergeRowByKey <- function (mergeMe, keyCol = 1, sep = ";"){
    mergeTwoCol <- function(x){
        return(paste(unique(x[, -keyCol]), sep = "", collapse = ";"))
    }
    mergeMultiCol <- function(x){
        return(apply(x, 2, function(y)
                     paste(unique(y), sep = "", collapse = ";")))
    }
    # Returns the original value if mergeMe is a vector
    if(is.null(ncol(mergeMe))){
        return(mergeMe)
    }
    merged <- split.data.frame(mergeMe, factor(mergeMe[, keyCol]))
    if(ncol(mergeMe) == 2){
        merged <- sapply(merged, mergeTwoCol)
        merged <- cbind(names(merged), merged)
        colnames(merged) <- c(colnames(mergeMe)[keyCol],
                              colnames(mergeMe)[-keyCol])
        return(merged)

    }else{
        merged <- sapply(merged, mergeMultiCol)
        merged <- t(merged)
        colnames(merged) <- colnames(mergeMe)
        return(merged)
    }
}




























# This function prints the quality control results for a givan data package.
#
# Copyright 2002, J. Zhang. All rights reserved.
#

print.ABQCList <- function(x, ...){
    cat("\n",
        paste("\nQuality control information for ",
              x$name, ":\n", sep = ""),
        paste("\nDate built:", x$built),
        ifelse(is.null(x$probeNum), "",
        paste("\nNumber of probes:", x$probeNum)),
        ifelse(is.null(x$numMissMatch), "",
        paste("\nProbe number missmatch:", paste(x$numMissMatch,
              sep = "", collapse = "; "))),
        ifelse(is.null(x$probeMissMatch), "",
        paste("\nProbe missmatch:", paste(x$probeMissMatch,
              sep = "", collapse = "; "))),
        ifelse(is.null(x$probeMapped), "", paste(
        "\nMappings found for probe based rda files: \n",
        paste("\t", names(x$probeMapped), "found", x$probeMapped,
                    "of", x$probeNum, sep = " ", collapse = "\n"),
                                   sep = "", collapse ="\n")),
        "\nMappings found for non-probe based rda files:\n",
        paste("\t", names(x$otherMapped), "found", x$otherMapped,
              sep = " ", collapse = "\n"),"\n\n")
}

# Constructors an object of pubRepo
#

pubRepo <- function(srcUrl, parser = "", baseFile = ""){
    new("pubRepo", srcUrl = srcUrl, parser = parser, baseFile = baseFile)
}

# This function unifies mappings obtained from different source and
# returns a set of unified mappings based on user's specification.
#
# Copyrihgt 2002, J. Zhnag. All rights reserved.
#

resolveMaps <- function(maps, trusted, srcs, colNames = NULL,
                        outName = "", asFile = TRUE){

    doOneRow <- function(vect){
        temp <- vect[!is.null(vect)]
        temp <- temp[!is.na(temp)]
        temp <- temp[toupper(temp) != "NA"]
        if(length(temp) == 0){
            vote <- c(NA, length(vect))
        }
        if(any(!is.na(temp[trusted]))){
            # Get vote from trusted
            vote <- getVote(temp[trusted])
        }else{
            if(any(!is.na(temp[srcs]))){
                # Get vote from sources when there is no vote from trusted
                vote <- getVote(temp[srcs])
            }else{
                # No mapping at all
                vote <- c(NA, length(c(trusted, srcs)))
            }
        }
#        return(vote)

        return(c(vect[setdiff(names(vect), c(trusted, srcs))],
                 vote[1], vote[2]))
    }

    if(!is.null(colNames)){
        colnames(maps) <- colNames
    }
    if(any(!is.element(trusted, colnames(maps)))){
        stop("trusted must be one of the colNames")
    }
    if(asFile && outName == ""){
        outName <- file.path(.path.package("AnnBuilder"), "temp",
                          basename(tempfile("tempFile")))
    }
#    temp <- apply(maps[c(trusted, srcs)], 1, doOneRow)
#    unified <- cbind(maps[setdiff(names(maps), c(trusted, srcs))], t(temp))
#    if(asFile){
#        write.table(x = unified, file = outName,
#                    quote = FALSE, sep = "\t", row.names = FALSE,
#                    col.names = FALSE)
#        return(outName)
#    }else{
#        return(t(unified))
#    }
    temp <- apply(maps, 1, doOneRow)
    if(asFile){
        write.table(x = t(temp), file = outName,
                    quote = FALSE, sep = "\t", row.names = FALSE,
                    col.names = FALSE)
        return(outName)
    }else{
        return(t(temp))
    }
}

# Finds agreement among a vector containing different sources
getUnified <- function(voters){
    # Find the number of repeatition each element occurs in a vector
    findRep <- function(toFind){
        counts <- length(voters[voters == toFind])
    }
    findSmallest <- function(vec){
        options(warn = -1)
        temp <- try(as.numeric(vec))
        options(warn = 0)
        if(any(is.na(temp))){
            return(sort(vec)[1])
        }else{
            return(sort(temp)[1])
        }
    }

    repeatition <- sapply(voters, findRep)
    majority <- max(repeatition, na.rm = TRUE)
    # If majority = 1, no agreement among sources. Get a mapping by
    # one of the sources
    if(majority == 1){
        return(c(getNoDup(voters), 1))
    }
    # majority <- max(repeatition, na.rm = TRUE)
    found <- c(unique(voters[repeatition == majority]))
    # Only one majority vote
    if(length(found) == 1){
        return(c(found, majority))
    }else{
        return(c(findSmallest(found), majority))
    }
}

# When sources do no agree, get one based on the rules defined here
getNoDup<- function(voters){
    if(length(voters) <= 1){
        return(voters)
    #}else if(length(voters) == 1){
     #   return(voters)
    }else{
        temp <- voters[1]
        # Find the mapping that has the smallest number of set of characters
        for(i in 2:length(voters)){
            if(temp > voters[i]){
                temp <- voters[i]
            }
        }
        return(temp)
    }
}

hasDelimit <- function(entry, deli = ";"){
    if(any(is.null(entry), is.na(entry))){
        return(FALSE)
    }
    if(gsub(paste(".*(", deli, ").*", sep = ""), "\\1",
                                  as.character(entry)) == ";"){
        return(TRUE)
    }else{
        return(FALSE)
    }
}

getVote <- function(voters, sep = ";"){
    entries <- unlist(sapply(voters, strsplit, sep))
    entries <- entries[!is.null(entries)]
    entries <- entries[!is.na(entries)]
    entries <- entries[toupper(entries) != "NA"]
    if(!any(duplicated(entries))){
        # No argeement
        return(c(getNoDup(entries), 1))
    }else{
        # With agreements
        return(getUnified(entries))
    }
}









# This function writes all the elements other than rda files required
# for a data package
writeAccessory <- function(pkgName, pkgPath, organism, version,
                         author = list(author = "who",
                         maintainer = "who@email.net"),
                         dataSrc = "various public data repositories",
                         license = "LGPL"){
    # Write to DESCRIPTION
    writeDescription(pkgName, pkgPath, version, author, dataSrc, license)
    # Write Rd files for all the rda files that already exist in data
    for(i in getAllRdaName(pkgName, pkgPath)){
        writeManPage(pkgName, pkgPath, i, organism,
                     src = get(i, AnnInfo)$src)
    }
    # Write a function that displays data information when called
    # using pkgName() and the man page for the function
    writeFun (pkgPath, pkgName, organism = organism)
    writeMan4Fun(pkgName, pkgPath, organism)
    # Write .First.lib
    writeZZZ(pkgPath, pkgName)
    # Put the pretty print function for QC data and man page in
#    putPrettyPrint(pkgName, pkgPath)
}

# Functions that write the documentation files for a data package.
#
# Copyright 2002, J. Zhang, all rights reserved.
#

writeManPage <- function(pkgName, pkgPath, manName, organism = "human",
                         src = "ll", isEnv = TRUE){

    path <- file.path(pkgPath, pkgName, "man")
    filename <- file.path(path, paste(pkgName, formatName(manName),
                                    ".Rd", sep = ""))
    if(is.na(src)){
        srcBuild <- "N/A"
    }else{
        if(toupper(organism) == "YEAST"){
            srcBuild <- getBuild4Yeast(src, manName)
        }else{
            srcBuild <- getSrcBuiltNRef(src, organism)
        }
    }
    write(paste("\\name{", paste(pkgName, manName, sep = ""), "}\n",
                "\\alias{", paste(pkgName, manName, sep = ""), "}\n",
                "\\title{", "An annotation data file for ", manName,
                " in the ", pkgName, " package}\n",
                "\\description{\n", get(manName, AnnInfo)$long, "\n}\n",
                srcBuild, getExample(pkgName, manName, isEnv),
                "\\keyword{datasets}\n", sep = ""), filename)
}

getExample <- function(pkgName, manName, isEnv = TRUE){
    if(isEnv){
        return(paste("\\examples{\n", paste(paste("xx <- ls(env = ",
                           paste(pkgName, formatName(manName), sep = ""),
                           ")", sep = ""), "if(length(xx) > 0){",
                                            paste("get(xx[1], env = ",
                           paste(pkgName, formatName(manName), ")",
                                 sep = ""), "}", sep = ""), sep = "\n"),
                           "\n}\n", sep = ""))
    }else{
        return(paste("\\examples{\n",
               paste(pkgName, manName, sep = ""), "\n}\n", sep = ""))
    }
}

getSrcBuiltNRef <- function(src, organism){
    switch(toupper(src),
           "LL" = dataSource <- "LocusLink",
           "UG" = dataSource <- "UniGene",
           "KEGG" = dataSource <- "KEGG",
           "GO" = dataSource <- "Gene Ontology",
           "GP" = dataSource <- "Human Genome Project",
           "YG" = dataSrurce <- "Yeast Genome Project",
           "HG" = dataSource <- "HomoloGene",
           dataSource <- src)

    if(src == ""){
        return(paste("\\details{\n",
               paste(paste("Package built:", date())), "\n}\n", sep = ""))
    }else{
        return(paste("\\details{\n", paste("Mappings were based on data ",
                                           "provided by ", dataSource,
                                           "\n\n", sep = ""),
                     paste(paste("Source data built:",
                                 getSrcNBuilt(src, organism), "\n",
                                 paste("Package built:", date()),
                                 sep = "")), "\n}\n",
                     "\\references{\n", paste("\\url{", getSrcUrl(src), "}",
                                              sep = ""), "\n}\n", sep = ""))
    }
}

# This function writes the man page for the function that displays
# data information when invoked by typing pkgName()
writeMan4Fun <- function(pkgName, pkgPath, organism = "human", QCList){
    fileName <- file.path(pkgPath, pkgName, "man",
                          paste(pkgName, ".Rd", sep = ""))
    dSrc <- getDSrc(organism)
    item <- getSrcNBuilt(dSrc, organism)
    item <- paste(item, sep = "\n")
    write(paste("\\name{", pkgName, "}\n",
                "\\alias{", pkgName, "}\n",
                "\\title{",
                "Genomic Annotation data package built with AnnBuilder}\n",
                "\\description{\nThe package is built using a ",
                "downloadable R package - AnnBuilder (download and build ",
                "your own) from www.bioconductor.org using the ",
                "following public data sources:\n", item,
                "\n\nThe function ", pkgName, "() provides ",
                "information about the binary data files}\n",
                "\\keyword{datasets}\n", sep = ""), fileName)

}

escapeLatexChr <- function(item){
    item <- gsub("_", "\\\\_", item)
    item <- gsub("#", "\\\\#", item)
    item <- gsub("&", "\\\\&", item)
    item <- gsub("%", "\\\\%", item)
    item <- gsub("{", "\\\\{", item)
    item <- gsub("}", "\\\\}", item)
    return(item)
}

getSrcNBuilt <- function(dSrc, organism){
    items <- ""
    for(i in dSrc){
        switch(toupper(i),
               "LL" = toAdd <- paste("LocusLink",
                                      getUrlNBuilt("LL", organism)),
               "GP" = toAdd <- paste("Human Genome Project",
                                 getUrlNBuilt("GP", organism)),
               "UG" = toAdd <- paste("UniGene",
                                      getUrlNBuilt("UG", organism)),
               "GO" = toAdd <- paste("Gene Ontology Consortium",
                                      getUrlNBuilt("GO", organism)),
               "KEGG" = toAdd <- paste("KEGG",
                                      getUrlNBuilt("KEGG", organism)),
               "YG" = toAdd <- paste("Yeast Genome",
                                      getUrlNBuilt("YG", organism)),
               "HG" = toAdd <- paste("Homology",
                                      getUrlNBuilt("HG"), organism),
               warning(paste("Source", i, "is not a valid name!")))
        items <- paste(items, "\n", toAdd, sep = "")
    }
    return(items)
}

getUrlNBuilt <- function(src, organism){
    paste("built: ", escapeLatexChr(getSrcBuilt(src, organism)), ".",
          "\\url{", escapeLatexChr(getSrcUrl(src, organism)), "}.", sep = "")
}

#getItem <- function(name, content){
#    paste("\\item{", name, "}{",content, "}", sep = "")
#}

formatName <- function (toFormat){
    gsub("*(_)*", "", toFormat)
}

writeREADME <- function(pkgPath, pkgName, urls){

    OPENING <- paste("The binary data files in this package were constructed",
                     "using data from the following sources identified",
                     "by their URLs:")
    CLOSING <- paste("We highly appreciate their efforts of making",
                     "the data available publically")


    fileName <- file.path(pkgPath, pkgName, "README")

    write(OPENING, file = fileName, append = FALSE)
    for(i in urls)
        write(i, file = fileName, append = TRUE)
    write(CLOSING, file = fileName, append = TRUE)
}

#writeIndex <- function(path, pkgname){
#    indexPath <- file.path(path, pkgname)
#    system(paste("cd", indexPath))
#    system("R CMD Rdindex man >INDEX")
#    system(paste("cd", getwd()))
#}

writeDescription <- function(pkgName, pkgPath, version, author,
                             dataSrc = "various public data repositories",
                             license = "LGPL"){

    path <- file.path(pkgPath, pkgName)
    fileName <- file.path(path, "DESCRIPTION")
    if(file.exists(fileName)){
        unlink(fileName)
    }

    write(paste("Package:", pkgName,
                "\nTitle: A data package containing",
                "annotation data for", pkgName,
                "\nVersion:", version,
                "\nCreated:", date(),
                "\nAuthor:", author[["author"]],
                "\nDescription: Annotation data file for",
                pkgName, "assembled using data\n   from",
                dataSrc,
                "\nMaintainer:", author[["author"]], "<",
                author[["maintainer"]], ">",
                "\nLicense:", license, "\n"), file = fileName, append = FALSE)
}


getDSrc <- function(organism){
    switch(tolower(organism),
           "human" = ,
           "rat" = ,
           "mouse" = return(c("LL", "GO", "KEGG", "GP", "UG")),
           "yeast" = return(c("KEGG", "GO", "YG")),
           "kegg" = return("KEGG"),
           "go" = return("GO"),
           "hg" = return("HG"),
           stop(paste("Organism", organism, "not supported!")))
}

# This function searches the package to be built for names of the rda
# files already created
getAllRdaName <- function(pkgName, pkgPath){
    rdaNames <- gsub(paste("(^", pkgName, "|.rda$)", sep = ""), "",
                     list.files(file.path(pkgPath, pkgName, "data")))
    return(rdaNames[!is.element(rdaNames, c("QC", "print.ABQCList"))])
}

writeFun <- function (pkgPath, pkgName, organism = "human"){
    fileName <- file.path(pkgPath, pkgName, "R",
                          paste(pkgName, ".R", sep = ""))

    toWrite <- paste(pkgName, " <- function(){\n",
                     "load(file.path(.path.package(\"", pkgName, "\"), ",
                     "\"data\", \"", paste(pkgName, "QC.rda", sep = ""),
                     "\"))\ncat(", paste(pkgName, "QC", sep = ""),
                     ")}", sep = "")

   write(toWrite, file = fileName)
}

writeZZZ <- function(pkgPath, pkgName){
    fileName <- file.path(pkgPath, pkgName, "R", "zzz.R")
    copySubstitute(file.path(.path.package("AnnBuilder"), "scripts",
                             "dataPkgLoad.R"), fileName,
                   list(PACKAGE = pkgName))
}

#putPrettyPrint <- function(pkgName, pkgPath){
#    path <- file.path(.path.package("AnnBuilder"))
#    dump("print.ABQCList", file = file.path(pkgPath, pkgName, "R",
#                           "print.ABQCList.R"))
#    manPage <- readLines(file.path(path, "data", "print.ABQCList.Rd"))
#    write(manPage, file = file.path(pkgPath, pkgName, "man",
#    "print.ABQCList.Rd"))
#}


#getText4Desc <- function(pkgName, manName){
    # This is not used but still kept here
#    description <- paste("This is an R rda file (a external ",
#                         "representation of R objects that canbe ",
#                         "read back from at a later date by using ",
#                         "the function `load' (or `data' in some ",
#                         "cases)) that contains an R environment ",
#                         "object with key-value paris. For a ",
#                         "given key, the value is a vector with a ",
#                         "single or multiple elements depending ",
#                             "on whether the key can be mapped to ",
#                         "only one or a group of values",
#                         "\n\nAll the funcitons that can be ",
#                         "applied to an R environment is ",
#                         "applicable to the data file. For ",
#                         "example, ls(", paste(pkgName, manName,
#                                               sep = ""), ") ",
#                         "returns a vector containing all the ",
#                             "keys and get(", "keyvalue", "env = ",
#                         paste(pkgName, manName, sep = ""),
#                         " returns a vector containg the ",
#                         "value(s) corresponding to the key ",
#                         "(keyvalue). multiget(c(\"key1\", ",
#                         "\"key2\", ..., \"keyX\"), env = ",
#                         paste(pkgName, manName, sep = ""),
#                             ") returns a list with the ",
#                         "corresponding values as elements.", sep = "")
#    return()
#}

writeMan4QC <- function(pkgName, pkgPath){

    path <- file.path(pkgPath, pkgName, "man")
    filename <- file.path(pkgPath, pkgName, "man",
                           paste(pkgName, "QC.Rd", sep = ""))
    write(paste("\\name{", pkgName, "QC}\n",
                "\\alias{", pkgName, "QC}\n",
                "\\title{", "Quality control information for ",
                pkgName, "}\n",
                "\\description{\n A data file containing statistics ",
                "for all the data files in this package. The data ",
                "can be used for quality control purpose\n}\n",
                "\\details{\n", paste("This file contains quality control ",
                            "information that can be displayed by ",
                            "typing ", pkgName, "() after loading ",
                            "the package using library(", pkgName,
                            "). The follow items are included:\n\n",
                            "Date built - Date when the package was built.",
                            "\n\nNumber of probes - total number of ",
                            "probes included\n\n", "Probe number ",
                            "missmatch - if the total number of probes",
                            " of any of the data file is different ",
                            "from a base file used to check the data files",
                            " the name of the data file will be listed\n\n",
                            "Probe missmatch - if any of probes in a data",
                            " file missmatched that of the base file, ",
                            " the name of the data file will be listed\n\n",
                            " Mappings found for probe based files - ",
                            " number of mappings obtained for the total ",
                            "number of probes\n\n", "Mappings found for ",
                            "non-probe based files - total number of ",
                            "mappings obtained", sep = ""), "\n}\n",
                "\\keyword{datasets}\n", sep = ""), filename)
}

# Get build information for yeast genome source data
getBuild4Yeast <- function(src, manName){
    if(is.element(manName, c("GENENAME", "PMID", "GO", "ORF", "CHR",
                             "CHRLOC", "DESCRIPTION", "CHRLENGTHS",
                             "GO2PROBE", "GO2ALLPROBES", "ALIAS"))){
        if(manName != "DESCRIPTION"){
            probeDesc <- " PROBE ids are Open Reading Frame (ORF) ids."
        }else{
            probeDesc <- ""
        }
        return(paste("\\details{\n", "Mappings were based on data ",
                     "provided by Yeast Genome project.", probeDesc, "\n\n",
                     paste(paste("Source data built:",
                     "Yeast Genome data are built at various time ",
                     "intervals. Sources used were downloaded ",
                     date(), "\n",
                     paste("Package built:", date()), sep = "")), "\n}\n",
                     "\\references{\n", paste("\\url{", getYGUrl(), "}",
                                              sep = ""), "\n}\n", sep = ""))
    }else{
        return(getSrcBuiltNRef(src))
    }
}

# This function writes header information to an XML file.
#
# Copyright 2002 J. Zhang, all rights reserved.
#
writeXMLHeader <- function (outName, fileCol, name, version,
                            organism = "human"){

    writeLine <- function(toWrite){
        write(x = toWrite, file = outName, append = TRUE)
    }

    writeLine(paste("<?xml version = \"1.0\" encoding = ",
                       "\"UTF-8\" standalone = \"yes\"?>", sep = ""))

    writeLine(paste("<!DOCTYPE AnnBuilder: SYSTEM ",
                        "\"http://www.bioconductor.org/datafiles/dtds/",
                        "annotate.dtd\">", sep = ""))

    writeLine(paste("<AnnBuilder:Annotate xmlns:AnnBuilder = ",
                       "'http://www.bioconductor.org/AnnBuilder/'>",
                       sep = ""))

    writeLine("<AnnBuilder:Attr>")
    writeLine(paste("<AnnBuilder:Target value = \"", name,
                    "\"/>", sep = ""))
    writeLine(paste("<AnnBuilder:DateMade value = \"", date(),
                    "\"/>", sep = ""))
    writeLine(paste("<AnnBuilder:Version value = \"", version,
                    "\"/>", sep = ""))

    for (i in getDSrc(organism)){
        statement <- paste("<AnnBuilder:SourceFile url = \"", getSrcUrl(i),
                           "\" built = \"", getSrcBuilt(i), "\"/>", sep = "")
        writeLine(statement)
    }

#    load(file.path(.path.package("AnnBuilder"), "data", "AnnInfo.rda"))
    statement <- paste("<AnnBuilder:Entryid value = \"",
                       get(fileCol[1], AnnInfo)$short, "\"/>", sep = "")
    writeLine(statement)

    for(i in 2:length(fileCol)){
        statement <- paste("<AnnBuilder:Element value = \"",
                           fileCol[i], "\" describ = \"",
                           get(fileCol[i], AnnInfo)$short, "\"/>", sep = "")
        writeLine(statement)
    }

    writeLine("</AnnBuilder:Attr>")
}









# Get a matrix with Affymetrix
yeastAnn <- function(base = "",
    yGenoUrl = "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/",
    yGenoNames = c("literature_curation/gene_literature.tab",
    "chromosomal_feature/chromosomal_feature.tab",
    "literature_curation/gene_association.sgd"), toKeep =
                     list(c(6, 1), c(9, 2, 5, 6, 8, 11, 3), c(2, 5, 7)),
                     colNames = list(c("sgdid", "pmid"),
                     c("sgdid", "genename", "chr", "chrloc", "chrori",
                       "description", "alias"),
                     c("sgdid", "go", "evi")),
                     seps = c("\t", "\t", "\t"),
                     by = "sgdid"){

    # Each file should have matching parsing parameters
    if(any(c(length(toKeep), length(seps), length(colNames)) !=
           length(yGenoNames))){
        stop("Lengths of fileNames, toKeep, and seps have to be the same")
    }

    temp1 <- getProbe2SGD(base, yGenoUrl = yGenoUrl)
    # Creat a yeastGeno object
    ygeno <- YG(srcUrl = yGenoUrl)
    # merge temp1 with each of the annotation data defined by fileNames
    for(i in 1:length(yGenoNames)){
        temp <- readData(ygeno, yGenoNames[i], toKeep[[i]], seps[i])
        if(gsub(".*gene_associstion.sgd", "go", yGenoNames[i]) == "go"){
            colnames(temp) <- colNames[[i]]
             # Remove the leading and ending space
            dataCName <- colnames(temp)
            temp <- t(apply(temp, 1, function(x) gsub("^ | $", "", x)))
            colnames(temp) <- dataCName
            temp[,"go"] <- formatGO(temp[,"go"], temp[,"evi"])
            temp <- temp[, setdiff(colNames[[i]], "evi")]

        }else if(gsub(".*chromosomal.*tab$", "chr", yGenoNames[i]) == "chr"){
            colnames(temp) <- colNames[[i]]
            # Remove the leading and ending space
            dataCName <- colnames(temp)
            temp <- t(apply(temp, 1, function(x) gsub("^ | $", "", x)))
            colnames(temp) <- dataCName
            temp[,"chrloc"] <- formatChrLoc(temp[,"chr"], temp[,"chrloc"],
                                            temp[,"chrori"])
            temp[, "alias"] <- gsub("\\|", ";", temp[, "alias"])
            temp <- temp[, setdiff(colNames[[i]], "chrori")]
        }else{
            colnames(temp) <- colNames[[i]]
             # Remove the leading and ending space
            dataCName <- colnames(temp)
            temp <- t(apply(temp, 1, function(x) gsub("^ | $", "", x)))
            colnames(temp) <- dataCName
        }
        # Only merge ones mapped to SGD ids
        temp1 <- merge(temp1, mergeRowByKey(temp), by = by, all.x = TRUE)
#        temp1 <- (temp1[!duplicated(temp1[,by]) |
#                                         !duplicated(temp1[,"probe"]),])
    }
    # Return data with ORF removed
    return(temp1[, -match(colnames(temp1), by)])
}
# Gets mappings between ORF to sgdid
getProbe2SGD <- function(probe2ORF = "",
                     yGenoUrl =
                     "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/",
                     fileName = "literature_curation/orf_geneontology.tab",
                     toKeep = c(1, 7), colNames = c("orf","sgdid"),
                     sep = "\t", by = "orf"){
    yGeno <- YG(srcUrl = yGenoUrl)
    temp <- readData(yGeno, fileName, toKeep, sep)
    colnames(temp) <- colNames
    if(probe2ORF == ""){
        colnames(temp) <- c("probe", "sgdid")
        return(temp)
    }else{
        probe2SGD <- merge(probe2ORF, temp, by = by, all.x = TRUE)
        # Remove the duplicates. Keep the first one
        probe2SGD <- (probe2SGD[!duplicated(probe2SGD[,1])])
        return(probe2SGD)
    }
}

# A function to extract yeast annotation data from
# "ftp://genome-ftp.stanford.edu/pub/yeast/data_download".
#
# Data will be extracted from four files each with an column for
# source specific ids linking the data together.
#
# Copyright 2003, J. Zhang, all rights reserved.
#
procYeastGeno <- function(baseURL =
                "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/",
                fileName, toKeep, colNames, seps = "\t"){

    # Parse each file and merge based on source specified ids
    merged <- NULL
    # Creat a yeastGeno object
    ygeno <- yeastGeno(srcUrl = baseUrl)
    for(i in 1:length(fileNames)){
        temp <- readData(ygeno, fileNames[i], toKeep[[i]], seps[i])
        # The first row is for column name. Remove it
        temp <- temp[-1,]
        colnames(temp) <- colNames[[i]]
        if(is.null(merged)){
            merged <- temp
        }else{
            # Only merge the ones mapped to GenBank accessin number
            merged <- merge(merged, temp, by = "id", all = TRUE)
        }
    }
    # Drop the source specific id
    return(merged)
}
# GO ids from yeast data source do not have the leading GO and 0s. Put
# them back
formatGO <- function(gos, evis){
    add0 <- function(go){
        num <- 7 - nchar(go)
        return(paste("GO:", paste(rep("0", num), sep = "",
                                  collapse = ""), go, sep = "", collapse = ""))
    }
    gos <- sapply(gos, add0)
    return(paste(gos, evis, sep = "@"))
}
# Name chromosomal locations by chromosome number and add + for W and
# - for C strand
formatChrLoc <- function(chr, chrloc, chrori){
    chrori <- sapply(chrori, function(x) ifelse(x == "W", "+", "-"))
    chrloc <- paste(chrori, chrloc, "@", chr, sep = "")
    return(chrloc)
}

# Get the data that map Affymetrix probe ids to SGD_ORF identifiers
# that in trun can be mapped to annotation data using the source from
# yeast genomic web site.
getGEOYeast <- function(GEOAccNum, GEOUrl =
    "http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?",
                     geoCols = c(1, 8), yGenoUrl =
    "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/"){

    geo <- GEO(GEOUrl)
    geoData <- readData(geo, GEOAccNum)
    # Read the file with mappigs between Affymetrix probe ids and
    # SGD_ORF identifiers
    geoData <- geoData[,geoCols]
    colnames(geoData) <- c("probe", "orf")
    return(geoData)
}

# Constructors an object of GEO
#

YG <- function(srcUrl =
                  "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/",
                      parser = "", baseFile = ""){

    new("YG", srcUrl = srcUrl, parser = parser, baseFile = baseFile)
}

# Extracts the speficed columns from yeast genomic data. srcUrl is the base
# url for the ftp site, extenName is the extension for a target file
# name, cols2Keep is a vector for the number of data columns to keep,
# and sep is the delimiter used by the file.
getYeastData <- function(url, extenName, cols2Keep, sep){
    conn <- url(paste(url, extenName, sep = ""), open = "r")
    options(show.error.messages = FALSE)
    # Try read table first
    data <- try(read.table(conn, header = FALSE, sep = sep, quote = "",
                           as.is = TRUE, comment.char = ""))
    options(show.error.messages = TRUE)
    close(conn)
    # Has to use readLines as the source data are not well structured
    # and columns may missing if data = "try-error"
    if(inherits(data, "try-error")){
        data <- readBadData(paste(url, extenName, sep = ""), sep)
    }
    data <- as.matrix(data)
    return(data[, cols2Keep])
}

# Some of the yest data files are badly structured (e.g. missing
# columns) and have to be processed this way
readBadData <- function(url, sep){
     conn <- url(url, open = "r")
     data <- readLines(conn)
     close(conn)
     if(length(data) <= 10){
         colNum <- findNumCol(data, sep = sep)
     }else{
         colNum <- findNumCol(data[1:10], sep = sep)
     }

     data <- as.matrix(sapply(data, strsplit, sep))
     lengths <- as.vector(sapply(data, length))
     data <- unlist(data[lengths == colNum], use.names = FALSE)

     return(matrix(data, ncol = colNum, byrow = TRUE))
}

# Figures out the number of columns based on a section of a file
findNumCol <- function(fewLines, sep){
    lengths <- sapply(sapply(fewLines, strsplit, sep), length)
    return(max(lengths[duplicated(lengths)]))
}

# This function builds a data package for yeast geno.

yeastPkgBuilder <- function(pkgName, pkgPath, base = "", srcUrls = c(KEGG
                            = getSrcUrl("KEGG", organism = "yeast"),
                            GO = getSrcUrl("GO")), version = "1.1.0",
                            makeXML = TRUE, author = list(author = "who",
                            maintainer = "who@email.com"), fromWeb =
                            TRUE){

    getUniCol4Yeast <- function(){
        if(base != ""){
            return(c("PMID", "ORF", "CHR", "DESCRIPTION", "GENENAME"))
        }else{
            return(c("PMID", "CHR", "DESCRIPTION", "GENENAME"))
        }
    }
    getMultCol4Yeast <- function(){
        return("ALIAS")
    }

    if(!is.na(srcUrls["KEGG"])){
        kegg <- KEGG(srcUrl = srcUrls["KEGG"], organism = "yeast")
    }
    if(!is.na(srcUrls["GO"])){
        go <- GO(srcUrl = srcUrls["GO"])
    }

    makeSrcInfo()
    createEmptyDPkg(pkgName, pkgPath, force = TRUE)
    annotation <- yeastAnn(base)
    annotation <- as.matrix(annotation)
    colnames(annotation) <- toupper(colnames(annotation))
    if(makeXML){
        multC <- c("PMID", "ALIAS")
        typeC <- c("GO", "CHRLOC")
        XMLOut <- file.path(pkgPath, paste(pkgName, ".xml", sep = ""))
        fileToXML(targetName = pkgName, outName = XMLOut,
                  inName = annotation[, colnames(annotation) != "EVI"],
                  colNames = "", idColName = "PROBE", multColNames = multC,
                  typeColNames = typeC, isFile = FALSE, version = version)
    }
    # Write elements that has one to one mappings
    for(i in getUniCol4Yeast()){
        env <- new.env(hash = TRUE, parent = NULL)
        multiassign(annotation[,"PROBE"], as.vector(annotation[,i]), env)
        assign(paste(pkgName, i, sep = ""), env)
        save(list = paste(pkgName, i, sep = ""), file =  file.path(pkgPath,
                      pkgName, "data", paste(pkgName, i, ".rda", sep = "")))
    }
    # Write elements that has one to many mappings
    for(i in getMultCol4Yeast()){
        env <- new.env(hash = TRUE, parent = NULL)
        multiassign(annotation[,"PROBE"],
               lapply(annotation[,i], splitEntry), env)
        assign(paste(pkgName, i, sep = ""), env)
        save(list = paste(pkgName, i, sep = ""), file =  file.path(pkgPath,
             pkgName, "data", paste(pkgName, i, ".rda", sep = "")))
    }
    # GO and CHRLOC have evidence code and chromosome number
    # associated with each entry that needs special attention
    for(i in c("GO", "CHRLOC")){
        if(i == "CHRLOC"){
            num <- TRUE
        }else{
            num <- FALSE
        }
        env <- new.env(hash = TRUE, parent = NULL)
        multiassign(annotation[, "PROBE"],
                    lapply(annotation[, i], twoStepSplit,
                               asNumeric = num), env)
        assign(paste(pkgName, i, sep = ""), env)
        save(list = paste(pkgName, i, sep = ""), file =  file.path(pkgPath,
                     pkgName, "data", paste(pkgName, i, ".rda", sep = "")))
    }
    env <- cols2Env(annotation[,c("PMID", "PROBE")],
                    colNames = c("PMID", "PROBE"), keyColName = "PMID")
    assign(paste(pkgName, "PMID2PROBE", sep = ""), env)
    save(list = paste(pkgName, "PMID2PROBE", sep = ""),
         file =  file.path(pkgPath, pkgName, "data",
         paste(pkgName, "PMID2PROBE.rda", sep = "")))
    if(!is.na(srcUrls["KEGG"])){
        if(base == ""){
            xkey <- "PROBE"
        }else{
            xkey <- "ORF"
        }
        # Adding KEGG pathway data to annoataion
        pathNEnzyme <- mapLL2ECNPName(kegg)
        orfNEC <- pathNEnzyme$llec
        orfNPath <- pathNEnzyme$llpathname
        colnames(orfNEC) <- c("ORF", "ENZYME")
        colnames(orfNPath) <- c("ORF", "PATH")
        annotation <- merge(annotation, orfNPath, by.x = xkey,
                            by.y = "ORF", all.x = TRUE)
        annotation <- merge(annotation, orfNEC, by.x = xkey,
                            by.y = "ORF", all.x = TRUE)
        env <- cols2Env(annotation[,c("PATH", "PROBE")],
                        colNames = c("PATH", "PROBE"), keyColName = "PATH")
        assign(paste(pkgName, "PATH2PROBE", sep = ""), env)
        save(list = paste(pkgName, "PATH2PROBE", sep = ""),
             file =  file.path(pkgPath, pkgName, "data",
             paste(pkgName, "PATH2PROBE.rda", sep = "")))
        env <- cols2Env(annotation[,c("ENZYME", "PROBE")],
                        colNames = c("ENZYME", "PROBE"),
                        keyColName = "ENZYME")
        assign(paste(pkgName, "ENZYME2PROBE", sep = ""), env)
        save(list = paste(pkgName, "ENZYME2PROBE", sep = ""),
             file =  file.path(pkgPath, pkgName, "data",
             paste(pkgName, "ENZYME2PROBE.rda", sep = "")))
    }
    go2Probe <- new.env(hash = TRUE, parent = NULL)
    multiassign(as.vector(annotation[, "PROBE"]),
                    lapply(annotation[, "GO"], twoStepSplit,
                               asNumeric = FALSE), go2Probe)
    go2Probe <- unlist(multiget(ls(go2Probe), go2Probe))
    go2Probe  <- mergeRowByKey(cbind(go2Probe,
                                     gsub("\\..*", "", names(go2Probe))))
    colnames(go2Probe) <- c("GO", "GO2PROBE")
    # Write GO id to probe id mappings to data package
    env <- new.env(hash = TRUE, parent = NULL)
    multiassign(as.vector(go2Probe[, 1]),
                lapply(go2Probe[, 2], splitEntry), env)
    assign(paste(pkgName, "GO2PROBE", sep = ""), env)
    save(list = paste(pkgName, "GO2PROBE", sep = ""),
         file =  file.path(pkgPath, pkgName, "data",
         paste(pkgName, "GO2PROBE.rda", sep = "")))
    # Get the mappings between GO ids to all the probe ids (including
    # children)
    options(show.error.messages = FALSE)
    goData <- try(readData(go, xml = TRUE, fromWeb = fromWeb))
    options(show.error.messages = TRUE)
    go2All <- mapGO2AllProbe(go2Probe, goData, "", sep = ";", all = TRUE)
    # colnames(go2All) <- c("GO", "GO2ALLPROBE")
    # Write GO id to all probe ids mappings to data package
    env <- new.env(hash = TRUE, parent = NULL)
    multiassign(names(go2All), sapply(go2All, splitEntry), env)
    assign(paste(pkgName, "GO2ALLPROBES", sep = ""), env)
    save(list = paste(pkgName, "GO2ALLPROBES", sep = ""),
         file =  file.path(pkgPath, pkgName, "data",
         paste(pkgName, "GO2ALLPROBES.rda", sep = "")))
    go2All <- cbind(names(go2All), go2All)
    colnames(go2All) <- c("GO", "GO2ALLPROBES")
    if(makeXML){
        if(!is.null(go2All) && !is.null(go2Probe)){
            mergedGO <- as.matrix(merge(as.matrix(go2All),
                          as.matrix(go2Probe), by = "GO", all.x = TRUE))
            XMLByNum <- file.path(pkgPath, paste(pkgName, "ByNum.xml",
                                                 sep = ""))
            fileToXML(targetName = "GOByNum", outName = XMLByNum,
                  inName = mergedGO, colNames = "",
                  idColName = "GO", multColNames = c("GO2ALLPROBES",
                                    "GO2PROBE"),
                  typeColNames = "", isFile = FALSE, version = version)
        }
    }
    writeAccessory(pkgName, pkgPath, "yeast", version, author,
                   dataSrc = "Stanford Yeast Genome Project")
    # Write the quality control data
    # Quality control
    if(base == ""){
        base <- getProbe2SGD()
    }
    getDPStats(base, pkgName, pkgPath, isFile = FALSE)
    writeMan4QC(pkgName, pkgPath)
    chrLengths <- findYGChrLength()
    writeOrganism(pkgName, pkgPath, "yeast")
    writeChrLength(pkgName, pkgPath, chrLengths)
    writeManPage(pkgName, pkgPath, "CHRLENGTHS", organism = "yeast",
                 src = "YG", isEnv = FALSE)
    writeManPage(pkgName, pkgPath, "ORGANISM", organism = "yeast",
                 src = NA, isEnv = FALSE)
}

findYGChrLength <- function(yGenoUrl =
    "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/",
    yGenoName = "chromosomal_feature/chromosome_length.tab",
    toKeep = c(1, 3), sep = "\t"){

    ygeno <- YG(srcUrl = yGenoUrl)
    temp <- readData(ygeno, yGenoName, toKeep, sep)
    # Remove the leading and ending space
    temp <- t(apply(temp, 1, function(x) gsub("^ | $", "", x)))
    chrLengths <- temp[,2]
    names(chrLengths) <- temp[,1]

    return(chrLengths)
}
