SplitProbDup
splits an object of class ProbDup
into two on the
basis of set counts.
SplitProbDup(pdup, splitat = c(30, 30, 30))
An object of class ProbDup
.
A vector of 3 integers indicating the set count at which
Fuzzy, Phonetic and Semantic duplicate sets in pdup
are to be split.
A list with the the divided objects of class ProbDup
(pdup1
and pdup2
) along with the corresponding lists of
accessions present in each (list1
and list2
).
# \dontshow{
threads_dt <- data.table::getDTthreads()
threads_OMP <- Sys.getenv("OMP_THREAD_LIMIT")
data.table::setDTthreads(2)
data.table::setDTthreads(2)
Sys.setenv(`OMP_THREAD_LIMIT` = 2)
# }
if (FALSE) {
# Load PGR passport database
GN <- GN1000
# Specify as a vector the database fields to be used
GNfields <- c("NationalID", "CollNo", "DonorID", "OtherID1", "OtherID2")
# Clean the data
GN[GNfields] <- lapply(GN[GNfields], function(x) DataClean(x))
y1 <- list(c("Gujarat", "Dwarf"), c("Castle", "Cary"), c("Small", "Japan"),
c("Big", "Japan"), c("Mani", "Blanco"), c("Uganda", "Erect"),
c("Mota", "Company"))
y2 <- c("Dark", "Light", "Small", "Improved", "Punjab", "SAM")
y3 <- c("Local", "Bold", "Cary", "Mutant", "Runner", "Giant", "No.",
"Bunch", "Peanut")
GN[GNfields] <- lapply(GN[GNfields], function(x) MergeKW(x, y1, delim = c("space", "dash")))
GN[GNfields] <- lapply(GN[GNfields], function(x) MergePrefix(x, y2, delim = c("space", "dash")))
GN[GNfields] <- lapply(GN[GNfields], function(x) MergeSuffix(x, y3, delim = c("space", "dash")))
# Generate KWIC index
GNKWIC <- KWIC(GN, GNfields)
# Specify the exceptions as a vector
exep <- c("A", "B", "BIG", "BOLD", "BUNCH", "C", "COMPANY", "CULTURE",
"DARK", "E", "EARLY", "EC", "ERECT", "EXOTIC", "FLESH", "GROUNDNUT",
"GUTHUKAI", "IMPROVED", "K", "KUTHUKADAL", "KUTHUKAI", "LARGE",
"LIGHT", "LOCAL", "OF", "OVERO", "P", "PEANUT", "PURPLE", "R",
"RED", "RUNNER", "S1", "SAM", "SMALL", "SPANISH", "TAN", "TYPE",
"U", "VALENCIA", "VIRGINIA", "WHITE")
# Specify the synsets as a list
syn <- list(c("CHANDRA", "AH114"), c("TG1", "VIKRAM"))
# Fetch probable duplicate sets
GNdup <- ProbDup(kwic1 = GNKWIC, method = "a", excep = exep, fuzzy = TRUE,
phonetic = TRUE, encoding = "primary",
semantic = TRUE, syn = syn)
# Split the probable duplicate sets
GNdupSplit <- SplitProbDup(GNdup, splitat = c(10, 10, 10))
}
# \dontshow{
data.table::setDTthreads(threads_dt)
Sys.setenv(`OMP_THREAD_LIMIT` = threads_OMP)
# }