SQL Server ‐ Data Masking - shaysalomon12/Data-Engineer GitHub Wiki
View to generate Random Values
CREATE VIEW dbo.random(value) AS SELECT RAND();
GO
Randomization Function for Text Columns
ALTER FUNCTION dbo.fnRandomizedText (
@OldValue AS VARCHAR(MAX)
)RETURNS VARCHAR(MAX)
BEGIN
DECLARE @NewValue AS VARCHAR(MAX)
DECLARE @nCount AS INT
DECLARE @cCurrent AS CHAR(1)
DECLARE @cScrambled AS CHAR(1)
DECLARE @Random AS REAL
SET @NewValue = ''
SET @nCount = 0
WHILE (@nCount <= LEN(@OldValue))
BEGIN
SELECT @Random = value FROM random
SET @cCurrent = SUBSTRING(@OldValue, @nCount, 1)
-- English lowercase
IF ASCII(@cCurrent) BETWEEN ASCII('a') AND ASCII('z')
SET @cScrambled = CHAR(ROUND(((ASCII('z') - ASCII('a') - 1) * @Random + ASCII('a')), 0))
-- English uppercase
ELSE IF ASCII(@cCurrent) BETWEEN ASCII('A') AND ASCII('Z')
SET @cScrambled = CHAR(ROUND(((ASCII('Z') - ASCII('A') - 1) * @Random + ASCII('A')), 0))
-- Hebrew
ELSE IF ASCII(@cCurrent) BETWEEN ASCII('א') AND ASCII('ת')
SET @cScrambled = CHAR(ROUND(((ASCII('ת') - ASCII('א') - 1) * @Random + ASCII('א')), 0))
-- Numbers
ELSE IF ASCII(@cCurrent) BETWEEN ASCII('0') AND ASCII('9')
SET @cScrambled = CHAR(ROUND(((ASCII('9') - ASCII('0') - 1) * @Random + ASCII('0')), 0))
-- Arabic
ELSE IF UNICODE(@cCurrent) BETWEEN 1548 AND 1746
SET @cScrambled = CHAR(ROUND(((1746 - 1548 - 1) * @Random + 1548), 0))
ELSE
SET @cScrambled = @cCurrent
SET @NewValue = @NewValue + @cScrambled
SET @nCount = @nCount + 1
END
RETURN LTRIM(RTRIM(@NewValue))
END
GO
Randomization Function for Date Columns (not relevant)
ALTER FUNCTION dbo.fnRandomizedText_Date (
@OldValue AS VARCHAR(MAX)
)RETURNS VARCHAR(MAX)
BEGIN
DECLARE @NewValue AS VARCHAR(MAX)
DECLARE @nCount AS INT
DECLARE @cCurrent AS CHAR(1)
DECLARE @cScrambled AS CHAR(1)
DECLARE @Random AS REAL
SET @NewValue = ''
SET @nCount = 0
WHILE (@nCount <= LEN(@OldValue))
BEGIN
SELECT @Random = value FROM random
SET @cCurrent = SUBSTRING(@OldValue, @nCount, 1)
IF ASCII(@cCurrent) BETWEEN ASCII('a') AND ASCII('z')
SET @cScrambled = CHAR(ROUND(((ASCII('z') - ASCII('a') - 1) * @Random + ASCII('a')), 0))
ELSE IF ASCII(@cCurrent) BETWEEN ASCII('A') AND ASCII('Z')
SET @cScrambled = CHAR(ROUND(((ASCII('Z') - ASCII('A') - 1) * @Random + ASCII('A')), 0))
ELSE IF ASCII(@cCurrent) BETWEEN ASCII('א') AND ASCII('ת')
SET @cScrambled = CHAR(ROUND(((ASCII('ת') - ASCII('א') - 1) * @Random + ASCII('א')), 0))
ELSE IF ASCII(@cCurrent) BETWEEN ASCII('0') AND ASCII('9')
SET @cScrambled = CHAR(ROUND(((ASCII('9') - ASCII('0') - 1) * @Random + ASCII('0')), 0))
ELSE
SET @cScrambled = @cCurrent
SET @NewValue = @NewValue + @cScrambled
SET @nCount = @nCount + 1
END
RETURN LTRIM(RTRIM(@NewValue))
END
GO
Examples for using the Masking function
select dbo.fnRandomizedText('[email protected]');
Output: [email protected]
- Private + Last names
select dbo.fnRandomizedText('Arik Kuperman');
Output: Twgo Sqntkgwf
- Private + Last names Hebrew
select dbo.fnRandomizedText('אריק ברמן');
Output: -- םפטח צןמב
- Id
select dbo.fnRandomizedText(024511278);
Output: 51568241
- Visa card
select dbo.fnRandomizedText('4580-3211-5210-3120');
Output: 5666-6672-5634-8447
Some Columns in dm_inventory Table:
- table_name - table in a database with schema name
- column_id - number of column in a database
- column_name - column name
- relationship - symbol ('>-') indicating foreign key and direction
- primary_table - referenced table
- pk_column_name - referenced column
- no - column id in a key constraint
- fk_constraint_name - foreign key constraint name
Data Masking steps
1. Identify tables & columns that are candidates for data masking
Populate the Personally Identifiable Information (PII) table DBA.dbo.dm_inventory We use [cetgroups3_dm], the DB you want to run Data Masking and run the following
USE [cetgroups3_dm]
GO
DELETE from DBA.dbo.dm_inventory WHERE db_name = db_name();
WITH pk AS (
select schema_name(tab.schema_id) as [schema_name],
pk.[name] as pk_name,
ic.index_column_id as column_id,
col.[name] as column_name,
tab.[name] as table_name
from sys.tables tab
inner join sys.indexes pk ON tab.object_id = pk.object_id AND pk.is_primary_key = 1
inner join sys.index_columns ic ON ic.object_id = pk.object_id AND ic.index_id = pk.index_id
inner join sys.columns col ON pk.object_id = col.object_id AND col.column_id = ic.column_id
)
INSERT into DBA.dbo.dm_inventory
select
db_name() as db_name,
tab.object_id,
schema_name(tab.schema_id) + '.' + tab.name as [table_name],
col1.column_id,
col1.name as column_name,
t.name as data_type,
case when com.definition is not null then 1 else null end as is_computed,
col.is_identity,
case when fk.object_id is not null then '>-' else null end as relationship,
schema_name(pk_tab.schema_id) + '.' + pk_tab.name as primary_table,
pk_col.name as pk_column_name,
fk_cols.constraint_column_id as no,
fk.name as fk_constraint_name,
pk.pk_name as table_pk_name,
1 as enable_data_masking
from sys.tables tab
inner join sys.columns col ON col.object_id = tab.object_id
join sys.columns col1 ON col1.object_id = col.object_id
left outer join sys.foreign_key_columns fk_cols ON fk_cols.parent_object_id = tab.object_id AND fk_cols.parent_column_id = col1.column_id
left outer join sys.foreign_keys fk ON fk.object_id = fk_cols.constraint_object_id
left outer join sys.tables pk_tab ON pk_tab.object_id = fk_cols.referenced_object_id
left outer join sys.columns pk_col ON pk_col.column_id = fk_cols.referenced_column_id AND pk_col.object_id = fk_cols.referenced_object_id
left outer join pk ON pk.schema_name = schema_name(tab.schema_id) AND pk.table_name = tab.name AND pk.column_name = col1.name
left outer join sys.types t ON col1.user_type_id = t.user_type_id
left outer join sys.computed_columns com ON com.object_id = tab.object_id AND com.name = col1.name
where
col.name like '%gUserID%'
AND
(col1.name like '%name%'
or (col1.name like '%user%' and col1.name != 'gUserID')
or col1.name like '%mail%'
or col1.name like '%addr%'
or col1.name like '%TZ%'
or col1.name like '%password%'
or col1.name like '%city%'
or col1.name like '%phone%'
or col1.name like '%home%'
or col1.name like '%fax%'
or col1.name like '%birth%')
AND t.name NOT IN ('uniqueidentifier', 'decimal', 'bit','float')
AND col1.is_identity = 0
order by schema_name(tab.schema_id) + '.' + tab.name, col.[name]
;
- List the PII inventory table
select * from DBA.dbo.dm_inventory order by 3,4;
2. Clone cetgroups3 database
BACKUP DATABASE cetgroups3 TO DISK = 'F:\Backup\cetgroups3_20230705.bak' WITH STATS=10, COMPRESSION;
RESTORE DATABASE cetgroups3_dm FROM DISK='F:\Backup\cetgroups3_20230705.bak'
WITH
MOVE 'CetGroups3' TO 'D:\SqlData\CetGroups3_dm.mdf',
MOVE 'CetGroups3_log' TO 'F:\SqlLog\CetGroups3_dm_log.ldf';
3. Create the data masking SP from database DBA
USE [DBA]
GO
ALTER PROCEDURE [dbo].[sp_data_masking] (
@DB_Name NVARCHAR(128)
)
AS
BEGIN
DECLARE
-- @db_name nvarchar(128) = 'cetgroups3_dm',
@table_name nvarchar(128),
@column_name nvarchar(128),
@data_type nvarchar(128),
@enable_data_masking int,
@is_identity int,
@is_computed int,
@sql_cmd nvarchar(max),
@error_line nvarchar(max) = CAST(ERROR_LINE() AS nvarchar(100)),
@error_message nvarchar(max) = CAST(ERROR_MESSAGE() AS nvarchar(100)),
@text nvarchar(max),
@start_datetime datetime = sysdatetime(),
@counter int = 1,
@daysRand int,
@cur CURSOR;
IF OBJECT_ID('dm_log') IS NOT NULL DROP TABLE dm_log
CREATE TABLE dm_log ( time_date datetime, text nvarchar(max))
IF OBJECT_ID('tempdb..#gQA_UserID') IS NOT NULL DROP TABLE #gQA_UserID
CREATE TABLE #gQA_UserID (gUserID uniqueidentifier, nSchoolSign int)
SET XACT_ABORT, NOCOUNT ON;
BEGIN TRY
INSERT INTO dm_log VALUES (getdate(), '*** Started Data Masking for database ' + @DB_Name + ' ***')
INSERT INTO dm_log VALUES (getdate(), 'Building QA users')
-- Populate QA users table by Semel Mosad (sSchoolSign). These users will not be Data Masked since they are not real users
INSERT INTO #gQA_UserID
select u.gUserID, s.sSchoolSign
from cetgroups3..Users u
join cetgroups3..GroupUsers gu on gu.gUserId=u.gUserId
join cetgroups3..groups g on gu.gGroupId=g.gGroupId
join cetgroups3..Schools s on g.schoolId=s.nSchoolId
where s.sSchoolSign in ('123123','321321','3333333','141415','TEST-003','TEST-004','TEST-005','TEST-006','TEST-007','QA-666666','353534')
union
select
u.gUserID, s.sSchoolSign
from cetgroups3..Users u
join cetgroups3..GroupUsers gu on gu.gUserId=u.gUserId
join cetgroups3..Groups_hierarchy h on h.GroupId=gu.gGroupId
join cetgroups3..groups g on h.ParentGroupId=g.gGroupId
join cetgroups3..Schools s on g.schoolId=s.nSchoolId
where s.sSchoolSign in ('123123','321321','3333333','141415','TEST-003','TEST-004','TEST-005','TEST-006','TEST-007','QA-666666','353534')
SET @cur = CURSOR FOR
select table_name, column_name, data_type, is_identity, is_computed, enable_data_masking
from dm_inventory
where db_name = @DB_Name AND enable_data_masking = 1 AND is_computed is NULL
order by 1,2
OPEN @cur
FETCH NEXT FROM @cur INTO @table_name, @column_name, @data_type, @is_identity, @is_computed, @enable_data_masking
PRINT ('Open cursor')
WHILE @@FETCH_STATUS = 0 AND @counter <= 999
BEGIN
BEGIN TRANSACTION
IF @data_type in ('date', 'datetime', 'datetime2', 'time', 'timestamp','smalldatetime')
BEGIN
SET @daysRand = FLOOR(RAND()*(1000-5+1)+5) -- Random number between 5 and 1000
SET @sql_cmd = 'update ' + @DB_NAME + '.' + @table_name + ' set [' + @column_name + '] = datediff(DAY, -' + CAST(@DaysRand as varchar) + ', ' + @column_name + ')'
END
ELSE
BEGIN
SET @sql_cmd = 'update ' + @DB_NAME + '.' + @table_name + ' set [' + @column_name + '] = dbo.fnRandomizedText(' + @column_name+ ') where gUserID not in (select gUserID from #gQA_UserID)'
END
-- PRINT @sql_cmd
EXEC (@sql_cmd)
SET @text = 'Completed running: ' + @sql_cmd
SET @counter = @counter+1
INSERT INTO dm_log VALUES (getdate(), @text)
UPDATE dm_inventory SET @enable_data_masking = 0 WHERE db_name = @DB_Name AND table_name = @table_name AND column_name = @column_name
COMMIT
FETCH NEXT FROM @cur INTO @table_name, @column_name, @data_type, @is_identity, @is_computed, @enable_data_masking
END
CLOSE @cur;
DEALLOCATE @cur;
INSERT INTO dm_log VALUES (getdate(), '*** Finished Data Masking for database ' + @DB_Name + ' ***')
END TRY
BEGIN CATCH
IF @@TRANCOUNT > 0 ROLLBACK;
-- SELECT
-- ERROR_NUMBER() AS ErrorNumber
-- ,ERROR_SEVERITY() AS ErrorSeverity
-- ,ERROR_STATE() AS ErrorState
-- ,ERROR_PROCEDURE() AS ErrorProcedure
-- ,ERROR_LINE() AS ErrorLine
SET @error_message = ERROR_MESSAGE()
SET @text = 'Failed to mask table: ' + @DB_NAME + '.' + @table_name + ', Column: ' + @column_name + '. ' + @error_message
INSERT INTO dm_log VALUES (getdate(), @text)
PRINT(@text)
CLOSE @cur;
DEALLOCATE @cur;
END CATCH
END
GO
4. Run data masking for specified database ('cetgroups3_dm')
USE [cetgroups3_dm]
GO
EXEC sp_data_masking 'cetgroups3_dm';
5. Monitor Data Masking Progress Status
SELECT
ja.job_id,
ja.session_id,
j.name AS job_name,
ja.start_execution_date,
ISNULL(last_executed_step_id,0)+1 AS current_executed_step_id,
Js.step_name,
DATEDIFF(MINUTE, ja.start_execution_date, GETDATE()) AS Duration_min
FROM msdb.dbo.sysjobactivity ja
LEFT JOIN msdb.dbo.sysjobhistory jh ON ja.job_history_id = jh.instance_id
JOIN msdb.dbo.sysjobs j ON ja.job_id = j.job_id
JOIN msdb.dbo.sysjobsteps js
ON ja.job_id = js.job_id
AND ISNULL(ja.last_executed_step_id,0)+1 = js.step_id
WHERE
ja.session_id = (
SELECT TOP 1 session_id FROM msdb.dbo.syssessions ORDER BY agent_start_date DESC
)
-- AND j.name = @Job_Name
AND start_execution_date is not null
AND stop_execution_date is null;
select count(*) from dm_log;
select * from dm_log order by time_date desc;