24 June 2010 13:25
tonyrogerson
Recency Frequency talk from last nights user group - the SQL to do the Recency/Frequency grouping on Adventureworks - examples of OVER and PARTITION BY, using CTE's, CROSS APPLY
Code below is from the Recency Frequency of {} (RFM and RFI) talk I did at the user group last night, like I said its the first part - it doesn't do the category flow rates between temporal windows.
I'm writing a full day talk on how to do this using Inmon (normalised route) and Kimball (denormalised star/snowflake and dimensional database routes) - if you have any questions, ideas or suggestions then please do email me tonyrogerson@torver.net - I'm very keen on feedback.
USE AdventureWorks2008R2
go
--
-- Self Join and the Window of rows
--
select winstart = case when soh2.SalesOrderID = soh4.SalesOrderID then '***' else '' end,
control_input_SalesOrderID = soh2.SalesOrderID,
control_input_OrderDate = soh2.OrderDate,
lookup_input_SalesOrderID = soh4.SalesOrderID,
lookup_input_OrderDate = soh4.OrderDate,
recency_days = DATEDIFF( day, soh4.OrderDate, soh2.OrderDate )
from Sales.SalesOrderHeader soh2
left outer join Sales.SalesOrderHeader soh4
on soh4.CustomerID = soh2.CustomerID
AND soh4.OrderDate <= soh2.OrderDate
-- AND soh4.SalesOrderID <> soh2.SalesOrderID
where soh2.CustomerID = 11091
order by control_input_SalesOrderID,
lookup_input_SalesOrderID desc
--
-- What about efficiency? The above a good way to do this?
--
select soh2.CustomerID,
curr_order = soh2.OrderDate,
prev_order = soh3.OrderDate,
recency_days = DATEDIFF( day, soh3.OrderDate, soh2.OrderDate )
from Sales.SalesOrderHeader soh2
left outer join Sales.SalesOrderHeader soh3
on soh3.SalesOrderID = ( SELECT MAX( soh4.SalesOrderID )
FROM Sales.SalesOrderHeader soh4
WHERE soh4.CustomerID = soh2.CustomerID
AND soh4.OrderDate <= soh2.OrderDate
AND soh4.SalesOrderID <> soh2.SalesOrderID )
order by soh2.CustomerID, recency_days
go
-- Rob Farley....
select soh.CustomerID,
curr_order = soh.OrderDate,
prev_order = prev.PrevOrderDate,
recency_days = DATEDIFF(day, prev.PrevOrderDate, soh.OrderDate )
from Sales.SalesOrderHeader as soh
cross apply ( select MAX(prev.OrderDate) as PrevOrderDate
from Sales.SalesOrderHeader as prev
where prev.CustomerID = soh.CustomerID
and prev.SalesOrderID <> soh.SalesOrderID
and prev.OrderDate <= soh.OrderDate) prev
order by soh.CustomerID, recency_days
go
-- Itzik Ben Gan....
with soh_rn as
(
select CustomerID,
OrderDate,
n = row_number() over(order by CustomerID, OrderDate, SalesOrderID)
from Sales.SalesOrderHeader
)
select s1.CustomerID,
curr_order = s1.OrderDate,
prev_order = s2.OrderDate,
recency_days = DATEDIFF( day, s2.OrderDate, s1.OrderDate )
from soh_rn as s1
left outer join soh_rn as s2
on s1.CustomerID = s2.CustomerID
and s1.n = s2.n + 1
order by s1.CustomerID, recency_days
--
-- OVER clause
--
select CustomerID,
SalesOrderID,
SubTotal,
SUM( SubTotal ) OVER( PARTITION BY CustomerID ) AS GrandTotal,
( SubTotal /
SUM( SubTotal ) OVER( PARTITION BY CustomerID )) * 100 AS pct_GrandTotal
from Sales.SalesOrderHeader
where CustomerID in ( 11091
, 11176
, 11185
, 11200
, 11223 )
select CustomerID,
SalesOrderID,
SubTotal,
LEFT( CONVERT( Char(8), OrderDate, 112 ), 6 ) AS YYYYMM,
SUM( SubTotal ) OVER( PARTITION BY CustomerID, LEFT( CONVERT( Char(8), OrderDate, 112 ), 6 ) ) AS YYYYMMGrandTotal,
SUM( SubTotal ) OVER( PARTITION BY CustomerID ) AS GrandTotal,
( SubTotal / SUM( SubTotal ) OVER( PARTITION BY CustomerID )) * 100
AS pct_GrandTotal,
( SubTotal / SUM( SubTotal ) OVER( PARTITION BY CustomerID, LEFT( CONVERT( Char(8), OrderDate, 112 ), 6 ) )) * 100
AS pct_YYYYMMGrandTotal
from Sales.SalesOrderHeader
where CustomerID in ( 11091
, 11176
, 11185
, 11200
, 11223 )
--
-- CROSS APPLY
--
select soh.CustomerID,
curr_order = soh.OrderDate,
prev_order = prev.PrevOrderDate,
recency_days = DATEDIFF(day, prev.PrevOrderDate, soh.OrderDate )
from Sales.SalesOrderHeader as soh
cross apply ( select MAX(prev.OrderDate) as PrevOrderDate
from Sales.SalesOrderHeader as prev
where prev.CustomerID = soh.CustomerID
and prev.SalesOrderID <> soh.SalesOrderID
and prev.OrderDate <= soh.OrderDate) prev
order by soh.CustomerID, recency_days
go
--
-- Common Table Expression
--
; with recency ( curr_order, prev_order, recency_days )
as (
select curr_order = soh2.OrderDate,
prev_order = soh3.OrderDate,
recency_days = DATEDIFF( day, soh3.OrderDate, soh2.OrderDate )
from Sales.SalesOrderHeader soh2
left outer join Sales.SalesOrderHeader soh3
on soh3.SalesOrderID = ( SELECT MAX( soh4.SalesOrderID )
FROM Sales.SalesOrderHeader soh4
WHERE soh4.CustomerID = soh2.CustomerID
AND soh4.OrderDate <= soh2.OrderDate
AND soh4.SalesOrderID <> soh2.SalesOrderID )
where soh2.CustomerID = 11091
)
select curr_order,
prev_order,
recency_days,
average_recency = ( select AVG( recency_days )
from recency ),
stddev_recency = ( select STDEVP( recency_days )
from recency ),
dist_within_1std= ( select COUNT(*)
from recency r
cross join ( select stddev_recency = STDEVP( recency_days )
from recency ) as st
where r.recency_days between -stddev_recency and stddev_recency ),
dist_within_2std= ( select COUNT(*)
from recency r
cross join ( select stddev_recency = STDEVP( recency_days )
from recency ) as st
where r.recency_days between -stddev_recency * 2 and stddev_recency * 2 ),
dist_over_2std= ( select COUNT(*)
from recency r
cross join ( select stddev_recency = STDEVP( recency_days )
from recency ) as st
where r.recency_days < -stddev_recency * 2
or r.recency_days > stddev_recency * 2 ),
population_Size= ( select COUNT(*)
from recency )
from recency
order by curr_order
--
-- Pulling the data for the recency frequency analysis
--
; with soh_rn
as (
select CustomerID,
OrderDate,
SubTotal,
rn = row_number() over(order by CustomerID, OrderDate, SalesOrderID)
from Sales.SalesOrderHeader
)
, recency ( CustomerID, curr_order, prev_order, recency_days, SubTotal, rn )
as (
select s1.CustomerID,
curr_order = s1.OrderDate,
prev_order = s2.OrderDate,
recency_days = DATEDIFF( day, s2.OrderDate, s1.OrderDate ),
s1.subTotal,
s1.rn
from soh_rn as s1
left outer join soh_rn as s2
on s1.CustomerID = s2.CustomerID
and s1.rn = s2.rn + 1 )
select c1.*,
median = AVG( cast( case when population_size % 2 = 0 then
case when position_in_dataset between population_size / 2 -- even number - average the middle two
and ( population_size / 2 ) + 1 then c1.recency_days else null end
else case when position_in_dataset = ( population_size / 2 ) + 1 then c1.recency_days else null end -- odd number - take the middle
end as decimal( 28, 5 ) ) )
OVER( PARTITION BY c1.CustomerID )
into #t
from (
select r.CustomerID,
curr_order,
prev_order,
recency_days,
population_size = COUNT(recency_days) OVER( PARTITION BY r.CustomerID ),
position_in_dataset = ROW_NUMBER() OVER( PARTITION BY r.CustomerID ORDER BY recency_days),
average_recency = AVG( cast( recency_days as decimal( 28, 5 ) ) ) OVER( PARTITION BY r.CustomerID ),
stddev_recency = STDEV( recency_days ) OVER( PARTITION BY r.CustomerID ),
SubTotal,
rn
from recency r
) as c1
--
-- Frequency worked out...
--
select *
from #t
where CustomerID = 11176
order by position_in_dataset
order by CustomerID
, curr_order desc
select customerid from #t group by CustomerID order by COUNT(*) desc
--
-- Work out recency by frequency using the most recent order...
-- to current date, we are using the current date of 20080803 because
-- the data is static
--
select recency_days = DATEDIFF( day, curr_order, '20080803' ),
frequency = COALESCE( median, 1 ),
intensity = COUNT( * ),
monetary = SUM( SubTotal )
into #t2
from #t t
where rn = ( SELECT MAX( t2.rn )
FROM #t t2
WHERE t2.CustomerID = t.CustomerID
)
group by DATEDIFF( day, curr_order, '20080803' ),
COALESCE( median, 1 )
order by recency_days,
frequency
--
-- Group by category
--
declare @recency_categories table (
cat_name varchar(50) not null primary key clustered,
recency_from smallint not null,
recency_to smallint not null,
check( recency_from <= recency_to )
)
declare @frequency_categories table (
cat_name varchar(50) not null primary key clustered,
frequency_from smallint not null,
frequency_to smallint not null,
check( frequency_from <= frequency_to )
)
insert @recency_categories ( cat_name, recency_from, recency_to )
values ( 'past 15 days', 0, 15 ),
( '16 - 30 days', 16, 30 ),
( '31 - 45 days', 31, 45 ),
( '46 - 60 days', 46, 60 ),
( '61 - 75 days', 61, 75 ),
( '76 - 90 days', 76, 90 ),
( 'over 90 days', 91, 32767 )
insert @frequency_categories ( cat_name, frequency_from, frequency_to )
values ( 'single', 1, 1 ),
( '2 - 5', 2, 5 ),
( '6 - 10', 6, 10 ),
( 'over 10', 11, 32767 )
select recency_cat = rc.cat_name,
recency_ord = rc.recency_from,
frequency_cat = fq.cat_name,
frequency_ord = fq.frequency_from,
intensity = SUM( t.intensity ),
monetary = SUM( t.monetary )
from #t2 t
inner join @recency_categories rc on t.recency_days between rc.recency_from
and rc.recency_to
inner join @frequency_categories fq on t.frequency between fq.frequency_from
and fq.frequency_to
group by rc.cat_name,
rc.recency_from,
fq.cat_name,
fq.frequency_from
order by recency_ord,
frequency_ord
--
-- Review recency
--
select recency_cat = rc.cat_name,
recency_ord = rc.recency_from,
intensity = SUM( t.intensity ),
monetary = SUM( t.monetary )
from #t2 t
inner join @recency_categories rc on t.recency_days between rc.recency_from
and rc.recency_to
inner join @frequency_categories fq on t.frequency between fq.frequency_from
and fq.frequency_to
group by rc.cat_name,
rc.recency_from
order by recency_ord
--
-- Check
--
select COUNT( DISTINCT CustomerID )
from #t
where curr_order >= dateadd( day, -15, '20080803' )
select SUM( intensity )
from #t2
where recency_days <= 15
select COUNT( distinct customerid )
from Sales.SalesOrderHeader
where OrderDate >= dateadd( day, -15, '20080803' )
select *
from #t2
where recency_days <= 15
Filed under: RFM, RFI