Recently I had a list of about 1000 species names and wanted to keep only the plants, since that is the field I come from. I knew that Scott Chamberlain has put together the ritis package, which can do exactly this kind of thing. However, I had known of ITIS before and was keen to give it a shot myself.
Here's what I've come up with, using the ITIS API. (Updated on 11 Dec 2012: the previous version had a flaw with non-exact matches; it should be OK now.) There are, of course, taxa that are not covered by the database, e.g. Ixodes; see below:
library(XML)
# Look up the ITIS taxonomic serial number (TSN) of a genus or a binomial.
#
# Args:
#   sp_name: a single string, either "Genus" or "Genus epithet" (two words
#            separated by one space). Case and surrounding whitespace are
#            ignored when matching.
#
# Returns:
#   The TSN value(s) of the returned records whose name parts equal the
#   search words and that carry no lower-level name part; empty if no record
#   matches.
#
# Raises:
#   An error if sp_name is not exactly one string made of one or two words.
get_tsn <- function(sp_name) {
  query <- trimws(sp_name)
  units <- tolower(unlist(strsplit(query, " ")))
  # valid string? Checked before any request is sent; empty or missing input
  # is rejected too instead of falling through to a NULL result.
  if (length(query) != 1 || anyNA(units) ||
        length(units) < 1 || length(units) > 2) {
    stop("...No valid search string submitted (two words seperated by one space)!")
  }
  # The search key is percent-encoded so that the space inside a binomial
  # ("Homo sapiens") does not end up as a raw space in the request URL.
  itis_xml <- XML::htmlParse(paste0(
    "http://www.itis.gov/ITISWebService/services/ITISService/searchByScientificName?srchKey=",
    utils::URLencode(query, reserved = TRUE)
  ))
  # Text of every node found at `path`, lower-cased and stripped of whitespace.
  node_text <- function(path) {
    tolower(gsub("\\s+", "", XML::xpathSApply(itis_xml, path, XML::xmlValue)))
  }
  tsn <- XML::xpathSApply(itis_xml, "//tsn", XML::xmlValue)
  unitname1 <- node_text("//unitname1")
  unitname2 <- node_text("//unitname2")
  # sp_name = only genus: keep records whose unitname1 matches exactly and
  # whose unitname2 (lower-level taxon) is absent
  if (length(units) == 1) {
    return(tsn[unitname1 == units[1] & unitname2 == ""])
  }
  # sp_name = genus and epithet: keep records where both match exactly and
  # unitname3 (lower-level taxon) is absent
  unitname3 <- node_text("//unitname3")
  tsn[unitname1 == units[1] & unitname2 == units[2] & unitname3 == ""]
}
# Fetch the kingdom name that ITIS reports for a taxonomic serial number.
#
# Args:
#   tsn: TSN value, appended verbatim to the getKingdomNameFromTSN request URL.
#
# Returns:
#   The text of every <kingdomname> node in the parsed response, as delivered
#   by xpathSApply().
get_kngdm <- function(tsn) {
  request_url <- paste0(
    "http://www.itis.gov/ITISWebService/services/ITISService/getKingdomNameFromTSN?tsn=",
    tsn
  )
  response <- htmlParse(request_url)
  xpathSApply(response, "//kingdomname", xmlValue)
}
# Resolve a scientific name to its TSN and kingdom in one step.
#
# Args:
#   x: scientific name handed to get_tsn().
#
# Returns:
#   A list with the elements Name (the input), TSN (result of get_tsn()) and
#   Kingdom (result of get_kngdm() for that TSN).
get_tsn_kngdm <- function(x) {
  serial <- get_tsn(x)
  list(Name = x, TSN = serial, Kingdom = get_kngdm(serial))
}
# I had some API-related errors (I guess the service was not answering in
# some cases) which I couldn't resolve, so the lookup is wrapped in tryCatch:
# a failing name yields NULL instead of aborting the whole batch. The failure
# is reported as a warning so that a name does not vanish unnoticed.
#
# Args:
#   x: scientific name passed on to get_tsn_kngdm().
#
# Returns:
#   The list produced by get_tsn_kngdm(), or NULL if the lookup raised an
#   error.
get_tsn_kngdm_try <- function(x) {
  tryCatch(
    get_tsn_kngdm(x),
    error = function(e) {
      warning("Lookup failed for '", x, "': ", conditionMessage(e),
              call. = FALSE)
      NULL
    }
  )
}
# Example run: resolve a handful of names (each a genus or a binomial) and
# bind the per-name lists into one data frame; names whose lookup returned
# NULL contribute no row.
# "Viola" is included so the run reproduces the sample output shown below.
sp_names <- c("Clostridium", "Physcia", "Viola", "Ixodes", "LYNX",
              "Homo sapiens", "Canis lupus")
# Every name costs two web-service requests (one in get_tsn, one in
# get_kngdm), so the lookup is run and timed only once; the stored result is
# printed afterwards.
system.time(result <- data.frame(do.call(rbind, lapply(sp_names, FUN = get_tsn_kngdm_try))))
result
#
# result
# user system elapsed
# 1.54 0.01 33.66
# Name TSN Kingdom
# 1 Clostridium 555645 Monera
# 2 Physcia 14024 Fungi
# 3 Viola 22030 Plantae
# 4 Ixodes
# 5 LYNX 180581 Animalia
# 6 Homo sapiens 180092 Animalia
# 7 Canis lupus 180596 Animalia
#