Saturday, November 30

Home Brew Seasonal Adjustment

I have been using the R package stl() to undertake seasonal decompositions of time series data. However, it removes much more signal/noise than the approach used by by the Australian Bureau of Statistics. Because it produces a trend series that is much less noisy than that from the ABS, it is slower to identify turning points in the data (as can be seen in the next three charts).




I have looked around, but could not find another option that would run on my Apple Mac system. So I thought I would have a go at writing my own seasonal adjustment function (do it yourself seasonal adjustment). Before I get to my code, let's look at how it performed.




It appears to be much closer to the ABS benchmark. So on to the code. We will start with the Henderson moving average (which is used a number of times to smooth the series).

## henderson.R
## calculate a Henderson moving average

hma <- function(v, n)
{
    hmaSymmetricWeights <- function(n)
    {
        # caluclate the vector of symmetric weights
        # formula from ABS (2003), 'A Guide to Interpreting Time Series', page 41. 
        # returns a dictionary of symmetric Henderson weights indexed from 1 to n

        # calculate the constant denominator
        m <- as.integer((n-1)/2)
        m1 <- (m+1)*(m+1)
        m2 <- (m+2)*(m+2)
        d <- as.double(8*(m+2)*(m2-1)*(4*m2-1)*(4*m2-9)*(4*m2-25))

        # calculate the weights
        w <- rep(NA, n) # 1:n
        m3 <- (m+3)*(m+3)
        for( j in 0:m ) {
            j2 <- j*j
            v <- (315*(m1-j2)*(m2-j2)*(m3-j2)*(3*m2-11*j2-16))/d
            w[(m+1+j)] <- v
            if (j > 0)
                 w[(m+1-j)] <- v
        }
        return(w)
    }

    hmaAsymmetricWeights <- function(n, mw, w)
    {
        # calculate the asymmetric end-weights
        # formula from Mike Doherty (2001), 'The Surrogate Henderson Filters in X-11',
        # Aust, NZ J of Stat. 43(4), 2001, pp901-999
        # see formula (1) on page 903

        # returns a dictionary of asymmetrical weights from 1 to mw;
        # where mw is less than n, and
        # w is the dictionary of symmetric henderson weights indexed from 1 to n

        sumResidual <- sum(w[(mw+1):(n)])
        sumEnd <- 0.0
        for ( i in (mw+1):n )
            sumEnd <- sumEnd + ((i)-((mw+1)/2.0)) * w[i]

        ic <- 1.0
        if (n >= 13 && n < 15)
            ic <- 3.5
        if (n >= 15)
         ic <- 4.5

        b2s2 <- (4.0/pi)/(ic*ic)
        f1 <- as.double(sumResidual)/as.double((mw))

        u <- 1:mw
        for ( r in 1:mw ) {
            calc1 <- (r- (mw+1)/2.0) * b2s2
            calc2 <- 1 + ( mw*(mw-1)*(mw+1) /12.0 ) * b2s2
            u[r] <- w[r] + f1 + ( calc1 / calc2 ) * sumEnd
        }
        return (u)
    }
    
    # --- the main body of the function

    # test that n is an integer, odd and >= 5
    n <- as.integer(n[1]) # integer; not vectorised
    if(! is.integer(n) )
        stop("In hma(v, n), n must be an integer")
    if(n < 5)        
        stop("In hma(v, n), n must be >= 5")
    if( (n %% 2) == 0)
        stop("In hma(v, n), n must be odd")

    # confirm that v is a vector
    if ( !(is.vector(v) && is.atomic(v) && is.numeric(v)) )
        stop("In hma(v, n), v must be an atomic, numeric vector")

    # handle NA - need to think about this more
    if( any(is.na(v)) )
        stop("In hma(v, n), the vector, v, must not containe NA values")

    # calculate the symmetric weights
    weights <- hmaSymmetricWeights(n)

    # construct the return series
    l <- length(v)
    r <- rep(as.numeric(NA), l)    # r will be the vector we return
    if( l < n ) return( r )        # handle short vectors
    m <- as.integer((n-1)/2)
    for( i in 1:l ) {
        if( i <= m ) {
            # asymmetric weights at the front end
            w <- rev(hmaAsymmetricWeights(n, i+m, weights))
            r[i] <- sum(v[1:(i+m)]*w)
        }

        # apply the symmetric weights to the middle of v
        if( i > m && i <= l-m )
            r[i] <- sum(v[(i-m):(i+m)]*weights)

        if ( i > l-m ) {
            # asymmetric weights at the back end
            sz <- l - i + 1 + m
            w <- hmaAsymmetricWeights(n, sz, weights)
            r[i] <- sum(v[(i-m):l]*w)
        }
    }

    return (r)
}

Next we will look at the functions I use for seasonal adjustment. It assumes a multiplicative model; and that there are no missing data items or series breaks. The eleven steps in the decomposition process are set out in the "main function" towards the bottom of the next code block.

# A hugely cut-down algorithm drawing on ideas from the X-11 ARIMA
# seasonal adjustment package and the Australian Bureau of Statistics
# Assume we have a time series where: Original(t) = Trend(t) * Seasonal(t) * Irregular(t)

source("../bin/henderson.R") # for henderson moving average
library(zoo)
library(tseries)
library(forecast)

extend.series <- function(series.ts, extend.by=2)
{
    # extend the series using an SARIMA fit and prediction at each end
    # to backcast and forecast the likely series trajectories
    # (We extend by two periods to avoid HMA endpoint issues)

    # initialise
    log.series.ts <- log(series.ts)
    start.series.c <- start(series.ts)

    # forward prediction
    forward.fit <- NULL
    tryCatch(forward.fit <- auto.arima(log.series.ts, ic='bic'),
        error=function(e) {
            print(e)
        }
    )
    if(class(forward.fit) != 'Arima') return(NULL)
    prediction.log <- NULL
    tryCatch(prediction.log <- predict( forward.fit,
        n.ahead=(frequency(series.ts)*extend.by) ),
        error=function(e) {
            print(e)
        }
    )
    if(class(prediction.log) != 'list') return(NULL)

    # reverse prediction
    backwards.fit <- NULL
    reverse.log.series.ts <- ts( data=rev(as.vector(log.series.ts)),
        frequency=frequency(series.ts) )
    tryCatch(backwards.fit <- auto.arima(reverse.log.series.ts, ic='bic'),
        error=function(e) {
            print(e)
        }
    )
    if(class(backwards.fit) != 'Arima') return(NULL)
    prediction.log.reverse <- NULL
    tryCatch(prediction.log.reverse <- predict( backwards.fit,
        n.ahead=(frequency(series.ts)*extend.by) ),
        error=function(e) {
            print(e)
        }
    )
    if(class(prediction.log.reverse) != 'list') return(NULL)

    # construct the extended series
    extended.vector <- rev(exp(as.vector(prediction.log.reverse$pred)))
    extended.vector <- append(extended.vector, as.vector(series.ts) )
    extended.vector <- append( extended.vector,
        exp(as.vector(prediction.log$pred)) )
    start.series.c[1] <- start.series.c[1] - extend.by
    extended.ts <- ts(data=extended.vector,
        frequency=frequency(series.ts), start=start.series.c)

    # package up everything we know and return it in a named list ...
    return(list(extended=extended.ts, forward=forward.fit, backwards=backwards.fit))
}

adjust.seasonal.components <- function(raw.components.ts)
{
    # multipass filter to ensure the mean for any period is 1

    loop.seasonal <- function(components.v, freq, len, beginning)
    {
        for(start in seq(beginning,len,freq)) {
            if(start + freq - 1 > len)
                break
            end <- start + freq - 1
            multiplier <- freq / sum(components.v[start:end])
            components.v[start:end] <- components.v[start:end] * multiplier
        }
        return(components.v)
    }

    components.v <- as.vector(raw.components.ts)
    freq <- frequency(raw.components.ts)
    len <- length(components.v)

    # standardise full years from month one
    components.v <- loop.seasonal(components.v, freq, len, 1)
    # standardise full years from month seven
    components.v <- loop.seasonal(components.v, freq, len, 1+trunc(freq/2))
    # standardise the last full year
    components.v <- loop.seasonal(components.v, freq, len, len-freq+1)

    return( ts(components.v, start=start(raw.components.ts),
        frequency=frequency(raw.components.ts)) )
}

smooth.seasonal.components <- function(raw.seasonal.ts)
{
    median.filter <- function(vec, span)
    {
        n <- length(vec)
        halfspan <- trunc(span/2)
        ret.vec <- 1:n
        for(i in 1:n)
        {
            a <- max(1, i-halfspan)
            b <- min(n, i+halfspan)
            selection <- na.trim(vec[a:b])
            if(length(selection)==0)
                ret.vec[i] <- NA
            else
                ret.vec[i] <- median(selection)
        }
        return(ret.vec)
    }

    k3x5 <- c(1/15, 2/15, 3/15, 3/15, 3/15, 2/15, 1/15) # from SEASABS

    # convert from ts to zooreg
    seasonal.zr <- zooreg(as.vector(raw.seasonal.ts),
        start=start(raw.seasonal.ts),
        frequency=frequency(raw.seasonal.ts))

    for(period in 1:frequency(seasonal.zr)) {
        # get the column of data for period
        cases <- as.vector(seasonal.zr[cycle(seasonal.zr)==period,,drop=FALSE])

        # smooth the column

        # first - take the median over n periods to remove worst case outliers,
        # but this will lead to a more spikey seasonally adjusted series,
        # and possibly more residual seasonality in the irregular component
        # so, we only do it with longer series ...
        if(length(raw.seasonal.ts) > (4 * frequency(raw.seasonal.ts)))
            cases <- median.filter(vec=cases, span=3)

        # second - use the filter above to further smooth
        cases.smoothed <- filter(cases, filter=k3x5, sides=2)

        # replicate the seasonal terms at the start and end of the series
        # - lost through the 7-term MA smoothing
        base = 4
        if(is.na(cases.smoothed[base]))
            base = 5
        for(i in 1:3)
            cases.smoothed[base-i] <- cases.smoothed[base]  # base-1, base-2, base-3
        n <- length(cases.smoothed)
        if(is.na(cases.smoothed[n-3]))
            n <- n - 1
        for(i in 0:2)
            cases.smoothed[n-i] <- cases.smoothed[n-3]  # n-0, n-1, n-2

        # and put it back again
        seasonal.zr[cycle(seasonal.zr)==period] <- cases.smoothed
    }

    # convert back from zooreg to ts
    smooth.seasonal.ts <- as.ts(seasonal.zr)

    # re-adjust the data so that the 12 month seasonals sum to 12.
    smooth.seasonal.ts <- adjust.seasonal.components(smooth.seasonal.ts)

    return(smooth.seasonal.ts)
}

simple.trend.estimate <- function(series.ts)
{
    # a period moving average to estimate the trend.

    # pick the right filter
    f <- NULL
    freq = frequency(series.ts)
    if(freq == 2) f <- c(1/4, 1/2, 1/4)
    if(freq == 4) f <- c(1/8, 1/4, 1/4, 1/4, 1/8)
    if(freq == 12) f <- c(1/24, 1/12, 1/12, 1/12, 1/12, 1/12,
                          1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/24)

    # and move it over the series
    return( filter(series.ts, filter=f, sides=2) )
}

validate <- function(series.ts, henderson)
{
    # confirm input is a ts object
    if(!is.ts(series.ts))
        stop('Series.ts should be a ts time series object')

    # remove the leading and trailing NA observations
    series.ts <- na.trim(series.ts)

    # confirm the frequency is greater than 1
    freq <- frequency(series.ts)
    if(freq != 2 && freq != 4 && freq != 12) {
        print(freq)
        stop('Series.ts has an unsupported frequency')
    }

    # confirm the henderson term
    primes = c(5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47,
                53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109)
    if(!is.element(henderson, primes))
        stop('Henderson moving average should be prime between 5 and 109')

    # confirm we have sufficient data for a time series analysis
    if(length(series.ts) < 4*freq)
        stop('Series.ts not sufficiently long')

    return(series.ts)
}

# main function ...
get.seasonal <- function(original.ts, henderson=NULL, verbose=FALSE)
{
    # set the smoothing term
    if(is.null(henderson)) {
        freq = frequency(original.ts)
        if(freq == 2) henderson <- 5
        if(freq == 4) henderson <- 5
        if(freq == 12) henderson <- 13
        if(is.null(henderson)) henderson <- 29 # a number plucked from nowhere
    }
    henderson <- henderson[1] # not vectorised

    # validate the series - saves time debugging later
    series.ts <- validate(original.ts, henderson)

    # Step 1 -- first use SARIMA to extend the series a bit
    extended.l <- extend.series(original.ts)

    # Step 2 -- preliminary estimation of the trend using a moving average
    arima.is.ok <- !is.null(extended.l)
    if(arima.is.ok) {
        extended.ts <- extended.l$extended
        prelim.trend.ts <- simple.trend.estimate(extended.ts) # -- note extended.ts
        if(verbose) {
            cat('Forward ARIMA: '); print(extended.l$forward$arma)
            cat('Reverse ARIMA: '); print(extended.l$backwards$arma)
        }
    }
    else {
        # Because the ARIMA failed to find a good fit,
        # we will use a Henderson MA to estimate the trend
        # but this is probably an exercise in silliness.

        # let's smooth with a higher order henderson moving average
        # pick the next next highest prime after double the current value

        thisOrNextHighestPrime <- function(n) {
            n <- as.integer(n[1]) # integer and not vectorised
            if(!is.finite(n)) return(NA)
            if(n <= 2L) return(2L)

            if(n %% 2L == 0) n <- n + 1L # make odd
            isPrime <- function(n) all(n %% 2L:floor(sqrt(n)) != 0) # || n == 2L
            while(!isPrime(n)) n <- n + 2L # skip even
            return(n)
        }

        henderson <- thisOrNextHighestPrime(henderson * 2)

        # we will use straight up henderson moving average for initial trend
        extended.ts <- original.ts
        prelim.trend.ts <- ts( hma(as.vector(original.ts), henderson),
            start=start(original.ts),
            frequency=frequency(original.ts) )
        if(verbose) print('No ARIMA model found')
    }

    # Step 3 -- first estimation of the seasonal.irregular component
    prelim.seasonal.ts <- extended.ts / prelim.trend.ts # -- note use of extended.ts

    # Step 4 -- let's smooth the seasonal . irregular component
    prelim.smooth.seasonal.ts <- smooth.seasonal.components(prelim.seasonal.ts)

    # Step 5 -- Preliminary estimate of the seasonally adjusted data
    # -- note use of extended.ts
    prelim.seasadj.ts <- na.trim(extended.ts / prelim.smooth.seasonal.ts)

    # Step 6 -- A better estimate of the trend
    better.trend.ts <- ts( hma(as.vector(prelim.seasadj.ts), henderson),
        start=start(prelim.seasadj.ts),
        frequency=frequency(prelim.seasadj.ts) )

    # Step 7 -- A final estimate of the seasonal elements
    # -- note use of extended.ts
    better.seasonal.ts <- extended.ts / better.trend.ts
    final.seasonal.ts <- smooth.seasonal.components(better.seasonal.ts)
    final.seasonal.ts <- window(final.seasonal.ts, start=start(original.ts),
        end=end(original.ts))

    # Step 8 -- final estimate of the seasonally adjusted series
    final.seasadj.ts <- original.ts / final.seasonal.ts

    # Step 9 -- final trend
    final.trend.ts <- ts( hma(as.vector(final.seasadj.ts), henderson),
        start=start(final.seasadj.ts),
        frequency=frequency(final.seasadj.ts) )

    # step 10 -- final irregular
    final.irregular.ts <- final.seasadj.ts / final.trend.ts

    # Step 11 -- package up everything we know and
    # return it in a named list ...
    return(list(original=original.ts, trend=final.trend.ts,
        seasonal=final.seasonal.ts,
        irregular=final.irregular.ts, seasadj=final.seasadj.ts))
}

seasonallyAdjust <- function(df, breaks, start, frequency, verbose=TRUE) {
    # Apply seasonal adjustment to selected columns in a data frame
    # returns an augmented data frame, such that:
    # -- seasonally adjusted in new column break.sa
    # -- trend in new column break.trend

    sa.suffix <- '.sa'
    trend.suffix <- '.trend'

    for(j in seq_along(breaks)) {
        if(verbose) { cat('seasonallyAdjust() Verbose: '); print(breaks[[j]]) }

        # - unique labels for the seasonal and trend components
        colName <- paste(breaks[[j]], sep='')
        colName.sa <- paste(colName, sa.suffix, sep='', collapse='')
        colName.trend <- paste(colName, trend.suffix, sep='', collapse='')

        # - seasonally decompose the series
        series <- df[, breaks[j]]
        series.ts <- ts(series, start=start, frequency=frequency)

        # --- code for seasonal adjustment using stl()
        #series.stl <- stl(series.ts, s.window='periodic', s.jump=1, t.jump=1, l.jump=1)
        #series.trend <- as.numeric(series.stl$time.series[, 'trend'])
        #series.sa <- series - as.numeric(series.stl$time.series[, 'seasonal'])

        series.season <- get.seasonal(original.ts=series.ts, verbose=verbose)
        series.trend <- as.numeric(series.season$trend)
        series.sa <- as.numeric(series.season$seasadj)

        # - store it and remember it
        df[, colName.sa] <- series.sa
        df[, colName.trend] <- series.trend
    }
    return(df)
}

No comments:

Post a Comment