# Convert a short number phrase to its numeric value.
#
# Recognized forms, checked in this order:
#   * a pair of single digits joined by a hyphen, such as "5-0" (as in
#     "the big 5-0"), read as 50;
#   * the first run of digits in the string, e.g. "42";
#   * an English number word for 0 through 99: a single word ("nineteen",
#     "forty") or a tens word followed by a units word, separated by a space
#     or a hyphen ("thirty five", "thirty-five").
#
# x: a single character string; case and surrounding whitespace are ignored.
# Returns the numeric value, or NA_integer_ when the phrase is not recognized.
word_to_num <- function(x) {
  # Trim as well as lower-case: callers may pass tokens with a trailing space
  # (e.g. "nineteen "), which would otherwise miss the lookup tables below.
  x <- trimws(tolower(x))

  # The digit pair must be tested before the plain-number rule, because the
  # plain-number rule would match the leading "5" of "5-0" and stop there.
  # Only single digits are paired, so ranges like "10-12" are left to the
  # plain-number rule.
  digit_pair <- "\\b\\d\\s*-\\s*\\d\\b"
  if (grepl(digit_pair, x)) {
    digits <- as.integer(
      strsplit(regmatches(x, regexpr(digit_pair, x)), "\\s*-\\s*")[[1]]
    )
    return(10 * digits[1] + digits[2])
  }

  # Plain number: take the first run of digits.
  plain_number <- "\\b\\d+\\b"
  if (grepl(plain_number, x)) {
    return(as.integer(regmatches(x, regexpr(plain_number, x))))
  }

  # Lookup tables for number words.
  units <- c(
    one = 1, two = 2, three = 3, four = 4, five = 5,
    six = 6, seven = 7, eight = 8, nine = 9
  )
  small <- c(
    zero = 0, units,
    ten = 10, eleven = 11, twelve = 12, thirteen = 13, fourteen = 14,
    fifteen = 15, sixteen = 16, seventeen = 17, eighteen = 18, nineteen = 19
  )
  tens <- c(
    twenty = 20, thirty = 30, forty = 40, fifty = 50,
    sixty = 60, seventy = 70, eighty = 80, ninety = 90
  )

  # Single word below twenty, e.g. "nineteen".
  if (x %in% names(small)) {
    return(small[[x]])
  }

  # Treat "thirty-five" and "thirty five" alike.
  words <- strsplit(gsub("-", " ", x), "\\s+")[[1]]

  # Tens word plus units word. The second word must be one..nine so that
  # nonsense such as "twenty eleven" is rejected instead of summed.
  if (length(words) == 2 &&
        words[1] %in% names(tens) &&
        words[2] %in% names(units)) {
    return(tens[[words[1]]] + units[[words[2]]])
  }

  # Bare tens word, e.g. "forty".
  if (length(words) == 1 && words[1] %in% names(tens)) {
    return(tens[[words[1]]])
  }

  NA_integer_
}
# Extract a name candidate from a sentence.
#
# Tries a list of introduction phrases in priority order ("I go by X",
# "I'm X", "They call me X", "X here" at the start of the string,
# "The name's X") and finally falls back to a capitalized word at the very
# start of the string that is followed by whitespace.
#
# s: a single character string.
# Returns the captured name (one capitalized word; optionally two for the
# "I'm" and "They call me" phrases), or NA_character_ when no pattern matches
# or `s` is missing / not a single string.
extract_name <- function(s) {
  # regexpr() yields NA for NA input, which would make the `if` below error.
  if (length(s) != 1L || is.na(s)) {
    return(NA_character_)
  }

  # Each pattern has exactly one capture group: the name.
  intro_patterns <- c(
    "I go by\\s+([A-Z][a-z]+)",
    "I'm\\s+([A-Z][a-z]+(?:\\s+[A-Z][a-z]+)?)",
    "They call me\\s+([A-Z][a-z]+(?:\\s+[A-Z][a-z]+)?)",
    "^([A-Z][a-z]+) here",
    "The name's\\s+([A-Z][a-z]+)",
    "^([A-Z][a-z]+)\\s" # fallback: leading capital word
  )

  for (pattern in intro_patterns) {
    hit <- regexpr(pattern, s, perl = TRUE)
    if (hit[1] == -1) {
      next
    }
    # Read the name straight from the capture group of this PCRE match rather
    # than re-running the pattern through sub() with a different regex engine.
    start <- attr(hit, "capture.start")[1L, 1L]
    len <- attr(hit, "capture.length")[1L, 1L]
    return(substring(s, start, start + len - 1L))
  }

  NA_character_
}
# Extract an age from a sentence and convert it to a number.
#
# Three phrase shapes are tried in order; the first one whose token converts
# to a number (via word_to_num) wins:
#   1. a number, a digit pair ("5-0"), or a word starting with N/n, followed
#      by one of the keywords years / year / birthday / young / this;
#   2. a single capitalized word followed by "years" (e.g. "Thirty years");
#   3. "big" followed by a digit pair (e.g. "the big 5-0").
#
# s: a single character string.
# Returns the converted age, or NA_integer_ when no phrase yields a number or
# `s` is missing / not a single string.
extract_age <- function(s) {
  # regexpr() yields NA for NA input, which would make the `if` below error.
  if (length(s) != 1L || is.na(s)) {
    return(NA_integer_)
  }

  # In every pattern the first capture group is the number token itself.
  age_patterns <- c(
    "(\\b\\d+\\b|\\b\\d+\\s*-\\s*\\d+\\b|\\b[Nn][a-z-]+\\b)\\s*(years|year|birthday|young|this)",
    "\\b([A-Z][a-z]+)\\b\\s+years",
    "big\\s+(\\d+\\s*-\\s*\\d+)"
  )

  for (pattern in age_patterns) {
    hit <- regexpr(pattern, s, perl = TRUE)
    if (hit[1] == -1) {
      next
    }
    # Take the token from capture group 1 so it carries no keyword and no
    # stray whitespace (a trailing space makes the word lookup fail).
    start <- attr(hit, "capture.start")[1L, 1L]
    len <- attr(hit, "capture.length")[1L, 1L]
    age <- word_to_num(substring(s, start, start + len - 1L))
    # A match that is not actually a number (e.g. "not this") must not block
    # the remaining patterns.
    if (!is.na(age)) {
      return(age)
    }
  }

  NA_integer_
}