P.S.: I just found this very informative presentation on regex.
# Regular-expression examples with base R's sub() and gsub().
# sub() replaces only the first match in each string, gsub() replaces all
# matches. Each call below is left unassigned so its result prints when the
# script is run interactively.

# --- Remove the full stop after an abbreviated (single-letter) name ----
# [A-Z] matches one uppercase letter; [.]? matches a literal full stop that
# is optional ("?" = repeated at most once). The parentheses delineate a
# back-reference (the uppercase letter), and the replacement "\\1" writes
# back only that letter, so a full stop directly after a capital is dropped.
# Capitals without a full stop are simply replaced by themselves, and
# lowercase abbreviations such as "i.e." or "etc." are never matched.
str <- c("i.e., George W. Bush", "Lyndon B. Johnson, etc.")
gsub("([A-Z])[.]?", "\\1", str)
# output: [1] "i.e., George W Bush" "Lyndon B Johnson, etc."

# --- Keep the first word and remove the rest ----
# Matches the first space followed by any character (the period), repeated
# zero or more times (the asterisk) -- i.e. everything from the first space
# to the end of the string -- and replaces it with nothing.
str <- c("George W. Bush", "Lyndon B. Johnson")
sub(" .*", "", str)
# output: [1] "George" "Lyndon"

# --- Remove the last word plus the preceding whitespace ----
# \\s is one whitespace character and \\w+ one or more word characters.
# The dollar sign $ is a meta-character that anchors the match at the END
# of the string (the caret ^ is the one that anchors at the beginning), so
# only the final word can match.
sub("\\s\\w+$", "", str)
# output: [1] "George W." "Lyndon B."

# --- Keep only the last word of a string ----
# ".*\\s" consumes anything, repeated arbitrarily often, up to and including
# a whitespace character; "(\\w+$)" then captures the word that ends the
# string. The whole match is replaced by "\\1", the first back-reference,
# which leaves just that last word.
sub(".*\\s(\\w+$)", "\\1", str)
# output: [1] "Bush" "Johnson"

# --- Strip punctuation except the full stop ----
# Inside a bracket expression a leading caret negates the set, so this
# matches every character that is NOT alphanumeric, NOT whitespace and NOT
# a full stop. Removing those keeps letters, digits, spaces and full stops
# and drops all other punctuation.
str <- c("&George W. Bush", "Lyndon B. Johnson?")
gsub("[^[:alnum:][:space:].]", "", str)
# output: [1] "George W. Bush" "Lyndon B. Johnson"
No comments :
Post a Comment