Finding anomalies in data

# make test data

# need a date sequence
# 
library(ggvis)
library(dplyr)
library(tidyr)
library(plotly)

# library(Hmisc)
# library(broom)
library(htmlwidgets)
library(AnomalyDetection)
set.seed(10)

dates <- seq(as.Date("2014-01-15"), by = "month", length.out = 36)

# annual oscillation , peak in summer, low in winter + upwards trend and some noise
# 


obs <- data.frame(date = dates, value= 
                    as.numeric(  - 50 *  cos( 2 * pi * as.integer(dates) / 365 ) + 60 
                                 + (dates - dates[1]) * 10 / 165  + rnorm(length(dates),mean = 10, sd = 20)),
                  type = rep('obs',length(dates)))

# sudden absolute change + 1000 after July 2014
# 
obs2 <- obs %>% mutate(value = ifelse(date > as.Date('2014-07-01'), value + 200, value),
                       type = 'obs2')
# one big spike
#
obs3 <- obs %>% mutate(value = ifelse(date == as.Date('2015-07-15'), value * 10, value),
                       type = 'obs3')
# magnified after July 2015
#
obs4 <- obs %>% mutate(value = ifelse(date > as.Date('2015-07-01'), value * 4, value),
                       type = 'obs4')
# magnified after July 2015
#
obs5 <- obs %>% mutate(value = ifelse(date > as.Date('2015-07-01'), value * 10, value),
                       type = 'obs5')
# combine all data
obs <- bind_rows(obs,obs2,obs3,obs4,obs5)

# plot them all
gp <- ggplot(obs, aes(date,value)) + geom_line( aes(colour=type))
ggplotly(gp)
saveWidget( ggplotly(gp), 'testData.html')
# function to check for anomalies

testForAnomaly <- function( otype)
{
  #print(otype)
  result <- NA
  mObs <- obs %>% filter(type == otype) %>%
    select(date,value)
  result <- AnomalyDetectionTs(mObs, max_anoms=0.01, 
                               direction='both',
                               piecewise_median_period_weeks = 4, 
                               plot=F)$anoms

  result <- as.data.frame(result)
  result <- merge(result,as.data.frame(otype),all=T)

  return(result)
}

# test function
types <- unique(obs$type)

#testForAnomaly(types[4])


library(parallel) # only works on Linux
system.time(
  test <- do.call(rbind, mcmapply(testForAnomaly, types,mc.cores = 32)) # linux line
  #test <- do.call(rbind, mcmapply(testForAnomaly, types,mc.cores = 1))
)

# display test data
test

# find 'single' spikes
# candidate types
candidates <- unique(test$otype)
# work out the median for each type
median <- obs %>% filter(type  %in% candidates & date > '2015-10-01') %>%
  group_by(type) %>% summarise(medianValue = median(value))

# merge the anomaly on
obsWithMedian <- (merge(median, test, by.x='type', by.y = 'otype', all = T))
# somes stupid date formatting
obsWithMedian$timestamp <- as.Date(obsWithMedian$timestamp)
# find types that have anomalies greater than 10 times the median
lookInMoreDetail <- (obsWithMedian %>% group_by(type) %>% filter(anoms > 10 * medianValue))

lookInMoreDetail

## end
Advertisement