# The data.table way: this feels faster than dplyr, but it isn't very functional (FP) when using the := (update-by-reference) methods.
# Alternatively, use the .() (alias for list()) feature to create a new table; that is still faster than dplyr or plyr.
# https://mran.microsoft.com/web/packages/data.table/vignettes/datatable-intro.html
library(data.table) # for fread and other data.table functions
library(tidyverse)  # for as_tibble to feed into ggplot
library(lubridate)  # for round_date
library(fasttime)   # for fastPOSIXct
 
# SQL for dtCLE01=fread("c:/kewoo/eai/CLE.Identity.d20171204.csv")
# -- iQ420349 Confirm Thuy's hypothesis
# SELECT    [transactiontype]
# ,[transactionid]
# ,CASE when COUNT(time_stamp) = 1 then 1 else 0 END one_is_timeout
# ,CASE when COUNT(time_stamp) = 1
# then 0.000755
# else DATEDIFF(MS, min(time_stamp), max([TIME_STAMP]))
# END duration_ms
# ,min([TIME_STAMP]) start
# -- ,min([transactiondata]) min_td
# ,max([time_stamp]) endt
# -- ,max([transactiondata])
# ,[componentname]
# ,correlationid
# -- use "with (nolock)" to prevent table locking
# FROM [HAWK_Log_Archive].[dbo].[PR_LOG] with (nolock)
# -- Refer to timestamp format for time-level granularity 
# where transactionid in (  SELECT [transactionid]    
# FROM [HAWK_Log_Archive].[dbo].[PR_LOG] with (nolock)
# where TIME_STAMP  >= '20171204 20:00:00:00' and TIME_STAMP < '20171204 23:35:00:00'
# and componentname like 'Identity%'
# )
# and status in ('Start','End')
# group by transactionid,transactiontype,applicationid,componentname,correlationid
# order by start
 
# Load the CLE transaction extract produced by the SQL quoted above.
dtCLE01 <- fread("c:/kewoo/eai/CLE.Identity.d20171204.csv")
# Offset, in seconds (10 hours), subtracted from every parsed timestamp below.
# NOTE(review): presumably the AEST (UTC+10) shift needed because fastPOSIXct
# reads its input as GMT -- confirm against the source system's clock.
AESTDiff <- 10 * 60 * 60
# Bucket width handed to round_date() when timestamps are rounded.
interval.length <- "1 seconds"
 
# exploratory: quick look at the shape of the freshly loaded extract
str(dtCLE01)
nrow(dtCLE01)
names(dtCLE01)
# Peek at the columns the analysis below actually consumes. The extract SQL
# above does not emit TIME_STAMP or APPLICATIONID (it aliases the timestamps
# as start/endt), so selecting those names would stop the script with an
# "object not found" error.
dtCLE01[, .(transactionid, componentname, start, endt)]
# end exploration
 
# One row per extract row: parse the start/end timestamps, shift them by
# AESTDiff seconds and round them to interval.length.
tb01.tx.times.all <- dtCLE01[, .(
  transactionid,
  componentname,
  startPct = round_date(fastPOSIXct(start) - AESTDiff, interval.length),
  endtPct = round_date(fastPOSIXct(endt) - AESTDiff, interval.length)
)]

# Step 1: expand every transaction into one row per second between its
# start and end (seq() on POSIXct with by = 1 steps in seconds).
tb01.expandedIntervals <- tb01.tx.times.all[
  , .(intervals = seq(startPct, endtPct, by = 1)), by = transactionid
]
# Step 2: count how many expanded rows fall on each second, i.e. the number
# of transactions in flight at that instant.
tb01.expandedIntervals <- tb01.expandedIntervals[
  , .(txCount = .N), by = intervals
]

# In-flight transaction count over time.
ggplot(tb01.expandedIntervals, aes(x = intervals, y = txCount)) +
  geom_line(colour = "blue")
 
 
 
 
 
 
# FOR FILTERED TIME RANGE
# Window bounds are shifted by the same AESTDiff offset that is applied to the
# data timestamps, so the comparison below stays consistent if the offset is
# ever changed (previously the literal 36000 was repeated here).
start.AEST <- fastPOSIXct("2017-12-04 21:17:00") - AESTDiff
end.AEST <- fastPOSIXct("2017-12-04 21:32:00") - AESTDiff
# Keep only transactions that start after and end before the window bounds
# (both comparisons are strict).
tb01.tx.times.filtered <- tb01.tx.times.all[startPct > start.AEST & endtPct < end.AEST]
# Expand each remaining transaction to one row per second it was in flight,
# then count rows per second. This overwrites the full-range
# tb01.expandedIntervals computed earlier.
tb01.expandedIntervals <- tb01.tx.times.filtered[, list(intervals = seq(startPct, endtPct, by = 1)), by = transactionid
                                            ][, list(txCount = .N), by = intervals]
ggplot() +
  geom_line(data = tb01.expandedIntervals, aes(x = intervals, y = txCount), color = 'blue')
 
 
 
 
 
 
# SQL for dtER=fread("c:/kewoo/eai/EXCEPTIONREC.identity.d20171204.csv")
# -- iQ420349 Confirm Thuy's hypothesis
#
#
#
# -- GENERIC EXCEPTIONREC SQL
# SELECT [TIME_STAMP]
# ,[COMPONENTNAME]
# ,[TRANSACTIONTYPE]
# ,[STATUS]
# ,[transactionid]
# ,[correlationid]
# ,[transactiondata]
# ,[stacktrace]
# ,[message]
# ,[custom] 
# -- use "with (nolock)" to prevent table locking
# FROM [HAWK_Log_Archive].[dbo].[PR_EXCEPTIONREC] with (nolock)
# -- Refer to timestamp format for time-level granularity 
# where TIME_STAMP  >= '20171204 20:00:00:00' and TIME_STAMP < '20171204 23:35:00:00'
# and componentname like 'Identity%'
# -- and transactionid = '9f2c6007-9c29-4d7f-8244-386c80881990'
# order by TIME_STAMP
 
 
 
# Load the EXCEPTIONREC extract produced by the SQL quoted above.
dtER <- fread("c:/kewoo/eai/EXCEPTIONREC.identity.d20171204.csv")

# One row per exception record, with its timestamp shifted by AESTDiff seconds
# and rounded to interval.length.
tb02.tx.times.all <- dtER[, .(
  transactionid,
  COMPONENTNAME,
  endtPct = round_date(fastPOSIXct(TIME_STAMP) - AESTDiff, interval.length)
)]

# Number of exception records per rounded timestamp.
tb02.txCounts <- tb02.tx.times.all[, .(txCount = .N), by = "endtPct"]

# Overlay: in-flight transactions (blue) against exception counts (red).
# tb01.expandedIntervals is whatever was last assigned above, i.e. the
# filtered-range counts when the script is run top to bottom.
ggplot() +
  geom_line(
    mapping = aes(x = intervals, y = txCount),
    data = tb01.expandedIntervals,
    colour = "blue"
  ) +
  geom_line(
    mapping = aes(x = endtPct, y = txCount),
    data = tb02.txCounts,
    colour = "red"
  )
 
 
# 20171215: The reason there's a drop in txCount during a service interruption
#           is because startPct == endtPct for those transactions, caused by
#           the group by in the original extracting SQL (a single log row
#           gives identical min and max timestamps).
#           Solution is to take the actual end time from EXCEPTIONREC, joining
#           via transactionid.
# first, join: X[Y, on = ] keeps every row of tb01.tx.times.all (Y) and attaches
# the matching EXCEPTIONREC rows (X). endtPct is then the exception time (NA
# when the transaction has no exception) and tb01's own end time is carried
# along as i.endtPct.
dtOJ <- tb02.tx.times.all[tb01.tx.times.all, on = "transactionid"]
# second, populate blank (NA) endtPct values with i.endtPct
dtOJ[is.na(endtPct), endtPct := i.endtPct]
# A transaction that matches several exception rows appears several times in
# dtOJ; seq() requires length-one endpoints, so collapse each group to its
# earliest start and latest end. With one row per transaction this is
# identical to using startPct/endtPct directly.
dtOJ.expandedIntervals <- dtOJ[, list(intervals = seq(min(startPct), max(endtPct), by = 1)), by = transactionid
                               ][, list(txCount = .N), by = intervals]
# In-flight transactions using the corrected end times (blue) against
# exception counts per second (red).
ggplot() +
  geom_line(data = dtOJ.expandedIntervals, aes(x = intervals, y = txCount), color = 'blue') +
  geom_line(data = tb02.txCounts, aes(x = endtPct, y = txCount), color = 'red')