Hi again, I thought I would improve on the previous post by actually putting a decent about of content into the Filetable - this time I used the opensource DMOZ Health document repository which contains 5,880 files inside 220 folders. The files are all html and are pretty small in size. The entire document collection is about 120Mb unzipped and 30Mb zipped. If any one is interested in testing this collection drop me a note and I will upload the dmoz_health repository archive to Skydrive.
This time I included the SET STATISTICS_IO ON data to show how well the engine is performing with this new feature. I will investigate more over the next few weeks and really begin to understand the impact of huge document collections stored in SQL Server 2012 and also some best practices on what, if any are the limitations of this feature.
EXECUTE xp_cmdshell 'md C:\dmoz_health', no_output
GO
SELECT * FROM sys.fulltext_semantic_languages
GO
create database dmoz_health
on primary (name = dmoz_health_File, filename = N'C:\dmoz_health\dmoz_health_File.mdf'),
filegroup SQLStorage
contains filestream
(name = dmoz_health_FS_File, filename = N'C:\dmoz_health\dmoz_health_FS')
with filestream
(non_transacted_access = full, directory_name = N'dmoz_health Articles')
GO
CREATE TABLE dmoz_health..Documents AS FileTable WITH (filetable_directory = N'Document Library')
GO
-- Then Load Office 2010 Filters
USE dmoz_health
GO
EXEC sp_fulltext_service 'load_os_resources',1
EXEC sp_fulltext_service 'restart_all_fdhosts'
CREATE FULLTEXT CATALOG ft AS DEFAULT
GO
CREATE UNIQUE INDEX DocumentsFt ON dmoz_health..Documents(stream_id)
GO
CREATE SEARCH PROPERTY LIST DocumentProperties;
GO
ALTER SEARCH PROPERTY LIST DocumentProperties
ADD 'Title'
WITH ( PROPERTY_SET_GUID = 'F29F85E0-4FF9-1068-AB91-08002B27B3D9', PROPERTY_INT_ID = 2,
PROPERTY_DESCRIPTION = 'System.Title - Title of the item.');
ALTER SEARCH PROPERTY LIST DocumentProperties
ADD 'Author'
WITH ( PROPERTY_SET_GUID = 'F29F85E0-4FF9-1068-AB91-08002B27B3D9', PROPERTY_INT_ID = 4,
PROPERTY_DESCRIPTION = 'System.Author - Author or authors of the item.');
ALTER SEARCH PROPERTY LIST DocumentProperties
ADD 'Tags'
WITH ( PROPERTY_SET_GUID = 'F29F85E0-4FF9-1068-AB91-08002B27B3D9', PROPERTY_INT_ID = 5,
PROPERTY_DESCRIPTION = 'System.Keywords - Keywords (Tags) of the item.');
GO
CREATE FULLTEXT INDEX ON dmoz_health..Documents
(file_stream TYPE COLUMN file_type LANGUAGE 1033 statistical_semantics)
KEY INDEX DocumentsFt ON ft
WITH SEARCH PROPERTY LIST = DocumentProperties
GO
-- Find path of Documents
USE dmoz_health
GO
SELECT FileTableRootPath() AS 'FileTable Root Path'
GO
-- Right-click the first returned row and select Copy.
/*
4. Open a Windows Explorer window, paste the Document Library path to the address bar, and press Enter. Then double-click the Document Library folder.
5. Open another Windows Explorer window, browse to the dmoz_health folder extracted from the dmoz_health.zip file (will upload this to Skydrive if requested).
6. Select all files and folders in the dmoz_health folder, and drag them to the Document Library folder, while pressing Ctrl.
*/
-- \\KIWI\MSSQLSERVER\dmoz_health Articles
-- Check all files are in the table
SELECT stream_id, name, file_type,
(SELECT REPLACE(N'File:\\'+FileTableRootPath()+file_stream.GetFileNamespacePath(), N' ', N'%20')
FROM dmoz_health..Documents T2
WHERE T2.stream_id = T1.stream_id FOR XML PATH(''), TYPE) AS FilePath,
creation_Time, last_write_time, last_access_time, is_directory, is_offline,
is_hidden, is_readonly, is_archive, is_system, is_temporary
FROM dmoz_health..Documents T1
ORDER BY name ASC
GO
-- 6104 rows
/*
Table 'Documents'. Scan count 0, logical reads 36, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'Worktable'. Scan count 2, logical reads 93, physical reads 0, read-ahead reads 0, lob logical reads 24, lob physical reads 0, lob read-ahead reads 0.
Table 'Documents'. Scan count 0, logical reads 36, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
etc...
(6104 row(s) affected)
Table 'Documents'. Scan count 1, logical reads 18699, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
*/
-- Querying table
/*
SELECT column_name FROM table_name
WHERE CONTAINS (PROPERTY (column_name,'property_name'),'')
*/
USE dmoz_health
GO
SELECT name DocumentName, file_stream.GetFileNamespacePath() Path FROM Documents
WHERE CONTAINS(PROPERTY(file_stream, 'Title'), 'migraine OR headache')
GO
-- 55 rows
SELECT name DocumentName, file_stream.GetFileNamespacePath() Path FROM Documents
WHERE CONTAINS(file_stream, 'NEAR(("migraine", "headache"), 5, FALSE)')
GO
-- 60 rows
/*
Table 'Documents'. Scan count 0, logical reads 28, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'Worktable'. Scan count 2, logical reads 69, physical reads 0, read-ahead reads 0, lob logical reads 16, lob physical reads 0, lob read-ahead reads 0.
Table 'Documents'. Scan count 0, logical reads 28, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
etc...
(60 row(s) affected)
Table 'Worktable'. Scan count 0, logical reads 0, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'Documents'. Scan count 1, logical reads 387, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'fulltext_index_docidmap_245575913'. Scan count 0, logical reads 132, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
*/
SELECT name, document_key, keyphrase, score
FROM semantickeyphrasetable(Documents, *)
INNER JOIN Documents ON stream_id = document_key
ORDER BY name, score DESC
GO
-- 469852 rows
/*
(469852 row(s) affected)
Table 'fulltext_index_docidmap_245575913'. Scan count 3, logical reads 70, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'Documents'. Scan count 3, logical reads 387, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'Worktable'. Scan count 0, logical reads 0, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'ti_245575913'. Scan count 3, logical reads 2391, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
*/
SELECT KeyPhrase, AVG(score) AS score FROM semantickeyphrasetable(Documents, *)
INNER JOIN Documents ON stream_id = document_key
GROUP BY KeyPhrase ORDER BY score DESC
GO
-- 46028 rows
DECLARE @Title as NVARCHAR(1000)
DECLARE @DocID as UNIQUEIDENTIFIER
SET @Title = '996621.htm'
SELECT @DocID = stream_id FROM Documents WHERE name = @Title
SELECT name, document_key, keyphrase, score
FROM semantickeyphrasetable(Documents, *, @DocID)
INNER JOIN Documents ON stream_id = document_key
ORDER BY name, score DESC
GO
-- 100 rows
/*
Table 'Documents'. Scan count 1, logical reads 246, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
(100 row(s) affected)
Table 'ti_245575913'. Scan count 1, logical reads 4, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'fulltext_index_docidmap_245575913'. Scan count 0, logical reads 2, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'Documents'. Scan count 0, logical reads 3, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
*/
DECLARE @Title as NVARCHAR(1000)
DECLARE @DocID as UNIQUEIDENTIFIER
SET @Title = '996578.htm'
SELECT @DocID = stream_id FROM Documents WHERE name = @Title
SELECT @Title AS SourceTitle, name AS MatchedTitle, stream_id, score
FROM semanticsimilaritytable(Documents, *, @DocID)
INNER JOIN Documents ON stream_id = matched_document_key
ORDER BY score DESC
GO
-- 10 rows
/*
Table 'Documents'. Scan count 1, logical reads 246, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
(10 row(s) affected)
Table 'Documents'. Scan count 0, logical reads 30, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'fulltext_index_docidmap_245575913'. Scan count 0, logical reads 22, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'dsi_245575913'. Scan count 1, logical reads 2, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
*/
USE dmoz_health
DECLARE @SourceTitle as NVARCHAR(1000)
DECLARE @MatchedTitle as NVARCHAR(1000)
DECLARE @SourceDocID as UNIQUEIDENTIFIER
DECLARE @MatchedDocID as UNIQUEIDENTIFIER
SET @SourceTitle = '996578.htm'
SET @MatchedTitle = '996603.htm'
SELECT @SourceDocID = stream_id FROM Documents WHERE name = @SourceTitle
SELECT @MatchedDocID = stream_id FROM Documents WHERE name = @MatchedTitle
SELECT @SourceTitle AS SourceTitle, @MatchedTitle AS MatchedTitle, keyphrase, score
FROM SEMANTICSIMILARITYDETAILSTABLE(Documents, file_stream, @SourceDocID, file_stream, @MatchedDocID)
ORDER BY score DESC
GO
-- 24 rows
/*
Table 'Documents'. Scan count 1, logical reads 246, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'Documents'. Scan count 1, logical reads 246, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
(24 row(s) affected)
Table 'ti_245575913'. Scan count 101, logical reads 505, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'fulltext_index_docidmap_245575913'. Scan count 0, logical reads 4, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
*/