Finding anomalies in data
# make test data
# need a date sequence
#
library(ggvis)
library(dplyr)
library(tidyr)
library(plotly)
# library(Hmisc)
# library(broom)
library(htmlwidgets)
library(AnomalyDetection)
set.seed(10)
dates <- seq(as.Date("2014-01-15"), by = "month", length.out = 36)
# annual oscillation , peak in summer, low in winter + upwards trend and some noise
#
obs <- data.frame(date = dates, value=
as.numeric( - 50 * cos( 2 * pi * as.integer(dates) / 365 ) + 60
+ (dates - dates[1]) * 10 / 165 + rnorm(length(dates),mean = 10, sd = 20)),
type = rep('obs',length(dates)))
# sudden absolute change + 1000 after July 2014
#
obs2 <- obs %>% mutate(value = ifelse(date > as.Date('2014-07-01'), value + 200, value),
type = 'obs2')
# one big spike
#
obs3 <- obs %>% mutate(value = ifelse(date == as.Date('2015-07-15'), value * 10, value),
type = 'obs3')
# magnified after July 2015
#
obs4 <- obs %>% mutate(value = ifelse(date > as.Date('2015-07-01'), value * 4, value),
type = 'obs4')
# magnified after July 2015
#
obs5 <- obs %>% mutate(value = ifelse(date > as.Date('2015-07-01'), value * 10, value),
type = 'obs5')
# combine all data
obs <- bind_rows(obs,obs2,obs3,obs4,obs5)
# plot them all
gp <- ggplot(obs, aes(date,value)) + geom_line( aes(colour=type))
ggplotly(gp)
saveWidget( ggplotly(gp), 'testData.html')
# function to check for anomalies
testForAnomaly <- function( otype)
{
#print(otype)
result <- NA
mObs <- obs %>% filter(type == otype) %>%
select(date,value)
result <- AnomalyDetectionTs(mObs, max_anoms=0.01,
direction='both',
piecewise_median_period_weeks = 4,
plot=F)$anoms
result <- as.data.frame(result)
result <- merge(result,as.data.frame(otype),all=T)
return(result)
}
# test function
types <- unique(obs$type)
#testForAnomaly(types[4])
library(parallel) # only works on Linux
system.time(
test <- do.call(rbind, mcmapply(testForAnomaly, types,mc.cores = 32)) # linux line
#test <- do.call(rbind, mcmapply(testForAnomaly, types,mc.cores = 1))
)
# display test data
test
# find 'single' spikes
# candidate types
candidates <- unique(test$otype)
# work out the median for each type
median <- obs %>% filter(type %in% candidates & date > '2015-10-01') %>%
group_by(type) %>% summarise(medianValue = median(value))
# merge the anomaly on
obsWithMedian <- (merge(median, test, by.x='type', by.y = 'otype', all = T))
# somes stupid date formatting
obsWithMedian$timestamp <- as.Date(obsWithMedian$timestamp)
# find types that have anomalies greater than 10 times the median
lookInMoreDetail <- (obsWithMedian %>% group_by(type) %>% filter(anoms > 10 * medianValue))
lookInMoreDetail
## end
Like this:
Like Loading...