SQL Server ‐ Data Masking - shaysalomon12/Data-Engineer GitHub Wiki

View to generate Random Values

CREATE VIEW dbo.random(value) AS SELECT RAND();
GO

Randomization Function for Text Columns

ALTER FUNCTION dbo.fnRandomizedText (
@OldValue AS VARCHAR(MAX)
)RETURNS VARCHAR(MAX)

BEGIN

  DECLARE @NewValue AS VARCHAR(MAX)
  DECLARE @nCount AS INT
  DECLARE @cCurrent AS CHAR(1)
  DECLARE @cScrambled AS CHAR(1)
  DECLARE @Random AS REAL
 
  SET @NewValue = ''
  SET @nCount = 0
   WHILE (@nCount <= LEN(@OldValue))
  BEGIN
    SELECT @Random = value FROM random
    SET @cCurrent = SUBSTRING(@OldValue, @nCount, 1)
    
	-- English lowercase
	IF ASCII(@cCurrent) BETWEEN ASCII('a') AND ASCII('z')
       SET @cScrambled = CHAR(ROUND(((ASCII('z') - ASCII('a') - 1) * @Random + ASCII('a')), 0))
    
	-- English uppercase
	ELSE IF ASCII(@cCurrent) BETWEEN ASCII('A') AND ASCII('Z')
       SET @cScrambled = CHAR(ROUND(((ASCII('Z') - ASCII('A') - 1) * @Random + ASCII('A')), 0))
    
	-- Hebrew
	ELSE IF ASCII(@cCurrent) BETWEEN ASCII('א') AND ASCII('ת')
       SET @cScrambled = CHAR(ROUND(((ASCII('ת') - ASCII('א') - 1) * @Random + ASCII('א')), 0))
	
	-- Numbers
	ELSE IF ASCII(@cCurrent) BETWEEN ASCII('0') AND ASCII('9')
       SET @cScrambled = CHAR(ROUND(((ASCII('9') - ASCII('0') - 1) * @Random + ASCII('0')), 0))

	-- Arabic
	ELSE IF UNICODE(@cCurrent) BETWEEN 1548 AND 1746
		SET @cScrambled = CHAR(ROUND(((1746 - 1548 - 1) * @Random + 1548), 0))
	
    ELSE
       SET @cScrambled = @cCurrent

    SET @NewValue = @NewValue + @cScrambled
    SET @nCount = @nCount + 1

  END
   RETURN LTRIM(RTRIM(@NewValue))
END
GO 

Randomization Function for Date Columns (not relevant)

ALTER FUNCTION dbo.fnRandomizedText_Date (
@OldValue AS VARCHAR(MAX)
)RETURNS VARCHAR(MAX)

BEGIN

  DECLARE @NewValue AS VARCHAR(MAX)
  DECLARE @nCount AS INT
  DECLARE @cCurrent AS CHAR(1)
  DECLARE @cScrambled AS CHAR(1)
  DECLARE @Random AS REAL
 
  SET @NewValue = ''
  SET @nCount = 0
   WHILE (@nCount <= LEN(@OldValue))
  BEGIN
    SELECT @Random = value FROM random
    SET @cCurrent = SUBSTRING(@OldValue, @nCount, 1)
    
	IF ASCII(@cCurrent) BETWEEN ASCII('a') AND ASCII('z')
       SET @cScrambled = CHAR(ROUND(((ASCII('z') - ASCII('a') - 1) * @Random + ASCII('a')), 0))
    ELSE IF ASCII(@cCurrent) BETWEEN ASCII('A') AND ASCII('Z')
       SET @cScrambled = CHAR(ROUND(((ASCII('Z') - ASCII('A') - 1) * @Random + ASCII('A')), 0))
    
	ELSE IF ASCII(@cCurrent) BETWEEN ASCII('א') AND ASCII('ת')
       SET @cScrambled = CHAR(ROUND(((ASCII('ת') - ASCII('א') - 1) * @Random + ASCII('א')), 0))
	
	ELSE IF ASCII(@cCurrent) BETWEEN ASCII('0') AND ASCII('9')
       SET @cScrambled = CHAR(ROUND(((ASCII('9') - ASCII('0') - 1) * @Random + ASCII('0')), 0))
    ELSE
       SET @cScrambled = @cCurrent

    SET @NewValue = @NewValue + @cScrambled
    SET @nCount = @nCount + 1

  END
   RETURN LTRIM(RTRIM(@NewValue))
END
GO 

Examples for using the Masking function

  • Email
select dbo.fnRandomizedText('[email protected]');		

Output: [email protected]

  • Private + Last names
select dbo.fnRandomizedText('Arik Kuperman');				

Output: Twgo Sqntkgwf

  • Private + Last names Hebrew
select dbo.fnRandomizedText('אריק ברמן');					

Output: -- םפטח צןמב

  • Id
select dbo.fnRandomizedText(024511278);						

Output: 51568241

  • Visa card
select dbo.fnRandomizedText('4580-3211-5210-3120');			

Output: 5666-6672-5634-8447

Some Columns in dm_inventory Table:

  • table_name - table in a database with schema name
  • column_id - number of column in a database
  • column_name - column name
  • relationship - symbol ('>-') indicating foreign key and direction
  • primary_table - referenced table
  • pk_column_name - referenced column
  • no - column id in a key constraint
  • fk_constraint_name - foreign key constraint name

Data Masking steps

1. Identify tables & columns that are candidates for data masking

Populate the Personally Identifiable Information (PII) table DBA.dbo.dm_inventory We use [cetgroups3_dm], the DB you want to run Data Masking and run the following

USE [cetgroups3_dm]
GO

DELETE from DBA.dbo.dm_inventory WHERE db_name = db_name();

WITH pk AS (
select schema_name(tab.schema_id) as [schema_name], 
    pk.[name] as pk_name,
    ic.index_column_id as column_id,
    col.[name] as column_name, 
    tab.[name] as table_name
from sys.tables tab
    inner join sys.indexes pk ON tab.object_id = pk.object_id AND pk.is_primary_key = 1
    inner join sys.index_columns ic ON ic.object_id = pk.object_id AND ic.index_id = pk.index_id
    inner join sys.columns col ON pk.object_id = col.object_id AND col.column_id = ic.column_id
)
INSERT into DBA.dbo.dm_inventory
select 
	db_name() as db_name,
	tab.object_id,
	schema_name(tab.schema_id) + '.' + tab.name as [table_name],
    col1.column_id,
    col1.name as column_name,
	t.name as data_type,
	case when com.definition is not null then 1 else null end as is_computed,
	col.is_identity,
    case when fk.object_id is not null then '>-' else null end as relationship,
    schema_name(pk_tab.schema_id) + '.' + pk_tab.name as primary_table,
    pk_col.name as pk_column_name,
    fk_cols.constraint_column_id as no,
    fk.name as fk_constraint_name,
	pk.pk_name as table_pk_name,
	1 as enable_data_masking
from sys.tables tab
    inner join sys.columns col ON col.object_id = tab.object_id
	join sys.columns col1 ON col1.object_id = col.object_id
    left outer join sys.foreign_key_columns fk_cols ON fk_cols.parent_object_id = tab.object_id AND fk_cols.parent_column_id = col1.column_id 
    left outer join sys.foreign_keys fk ON fk.object_id = fk_cols.constraint_object_id
    left outer join sys.tables pk_tab ON pk_tab.object_id = fk_cols.referenced_object_id
    left outer join sys.columns pk_col ON pk_col.column_id = fk_cols.referenced_column_id AND pk_col.object_id = fk_cols.referenced_object_id
	left outer join pk ON pk.schema_name = schema_name(tab.schema_id) AND pk.table_name = tab.name AND pk.column_name = col1.name
	left outer join sys.types t ON col1.user_type_id = t.user_type_id
	left outer join sys.computed_columns com ON com.object_id = tab.object_id AND com.name = col1.name
where 
    col.name like '%gUserID%'
    AND
	(col1.name like '%name%'
	or (col1.name like '%user%' and col1.name != 'gUserID')
	or col1.name like '%mail%'
	or col1.name like '%addr%'
	or col1.name like '%TZ%'
	or col1.name like '%password%'
	or col1.name like '%city%'
	or col1.name like '%phone%'
	or col1.name like '%home%'
	or col1.name like '%fax%'
	or col1.name like '%birth%')
	AND t.name NOT IN ('uniqueidentifier', 'decimal', 'bit','float') 
    AND col1.is_identity = 0
order by schema_name(tab.schema_id) + '.' + tab.name, col.[name]
;
  • List the PII inventory table
select * from DBA.dbo.dm_inventory order by 3,4;

2. Clone cetgroups3 database

BACKUP DATABASE cetgroups3 TO DISK = 'F:\Backup\cetgroups3_20230705.bak' WITH STATS=10, COMPRESSION;

RESTORE DATABASE cetgroups3_dm FROM DISK='F:\Backup\cetgroups3_20230705.bak'
WITH 
   MOVE 'CetGroups3' TO 'D:\SqlData\CetGroups3_dm.mdf',
   MOVE 'CetGroups3_log' TO 'F:\SqlLog\CetGroups3_dm_log.ldf';

3. Create the data masking SP from database DBA

USE [DBA]
GO

ALTER PROCEDURE [dbo].[sp_data_masking] (
	@DB_Name			NVARCHAR(128)
	)
AS
BEGIN
	DECLARE 
	-- @db_name nvarchar(128) = 'cetgroups3_dm',
	@table_name nvarchar(128),
	@column_name nvarchar(128),
	@data_type nvarchar(128),
	@enable_data_masking int,
	@is_identity int,
	@is_computed int,
	@sql_cmd nvarchar(max),
	@error_line nvarchar(max) = CAST(ERROR_LINE() AS nvarchar(100)),
	@error_message nvarchar(max) = CAST(ERROR_MESSAGE() AS nvarchar(100)),
	@text nvarchar(max),
	@start_datetime	datetime = sysdatetime(),
	@counter int = 1,
	@daysRand int,
	@cur CURSOR;

	IF OBJECT_ID('dm_log') IS NOT NULL DROP TABLE dm_log
	CREATE TABLE dm_log ( time_date datetime, text nvarchar(max))

    IF OBJECT_ID('tempdb..#gQA_UserID') IS NOT NULL DROP TABLE #gQA_UserID
    CREATE TABLE #gQA_UserID (gUserID uniqueidentifier, nSchoolSign int)

	SET XACT_ABORT, NOCOUNT ON;

	BEGIN TRY

        INSERT INTO dm_log VALUES (getdate(), '*** Started Data Masking for database ' + @DB_Name  + ' ***')

        INSERT INTO dm_log VALUES (getdate(), 'Building QA users')

        -- Populate QA users table by Semel Mosad (sSchoolSign). These users will not be Data Masked since they are not real users
        INSERT INTO #gQA_UserID
            select u.gUserID, s.sSchoolSign 
            from cetgroups3..Users u
            join cetgroups3..GroupUsers gu on gu.gUserId=u.gUserId
            join cetgroups3..groups g on gu.gGroupId=g.gGroupId
            join cetgroups3..Schools s on g.schoolId=s.nSchoolId
            where s.sSchoolSign in ('123123','321321','3333333','141415','TEST-003','TEST-004','TEST-005','TEST-006','TEST-007','QA-666666','353534')
			union
            select 
            u.gUserID, s.sSchoolSign 
            from cetgroups3..Users u
            join cetgroups3..GroupUsers gu on gu.gUserId=u.gUserId
            join cetgroups3..Groups_hierarchy h on h.GroupId=gu.gGroupId
            join cetgroups3..groups g on h.ParentGroupId=g.gGroupId
            join cetgroups3..Schools s on g.schoolId=s.nSchoolId
            where s.sSchoolSign in ('123123','321321','3333333','141415','TEST-003','TEST-004','TEST-005','TEST-006','TEST-007','QA-666666','353534')

		SET @cur = CURSOR FOR
			select table_name, column_name, data_type, is_identity, is_computed, enable_data_masking 
			from dm_inventory 
			where db_name = @DB_Name AND enable_data_masking = 1 AND is_computed is NULL
			order by 1,2

		OPEN @cur 
		FETCH NEXT FROM @cur INTO @table_name, @column_name, @data_type, @is_identity, @is_computed, @enable_data_masking 

		PRINT ('Open cursor')

		WHILE @@FETCH_STATUS = 0 AND @counter <= 999

		BEGIN
		
			BEGIN TRANSACTION
				
			IF @data_type in ('date', 'datetime', 'datetime2', 'time', 'timestamp','smalldatetime')
			BEGIN
				SET @daysRand = FLOOR(RAND()*(1000-5+1)+5)	-- Random number between 5 and 1000
				SET @sql_cmd = 'update ' + @DB_NAME + '.' + @table_name + ' set [' + @column_name + '] = datediff(DAY, -' + CAST(@DaysRand as varchar) + ', ' + @column_name + ')'
			END
			ELSE
			BEGIN
				SET @sql_cmd = 'update ' + @DB_NAME + '.' + @table_name + ' set [' + @column_name + '] = dbo.fnRandomizedText(' + @column_name+ ') where gUserID not in (select gUserID from #gQA_UserID)'
			END
				
			-- PRINT @sql_cmd
			EXEC (@sql_cmd)
			SET @text = 'Completed running: ' + @sql_cmd
			SET @counter = @counter+1

			INSERT INTO dm_log VALUES (getdate(), @text)
			UPDATE dm_inventory SET @enable_data_masking = 0 WHERE db_name = @DB_Name AND table_name = @table_name AND column_name = @column_name
			COMMIT
		
		FETCH NEXT FROM @cur INTO @table_name, @column_name, @data_type, @is_identity, @is_computed, @enable_data_masking
		END		

		CLOSE @cur;
		DEALLOCATE @cur;
		INSERT INTO dm_log VALUES (getdate(), '*** Finished Data Masking for database ' + @DB_Name + ' ***')	
	
	END TRY  

	BEGIN CATCH
			IF @@TRANCOUNT > 0 ROLLBACK;

			-- SELECT                                   
				-- ERROR_NUMBER() AS ErrorNumber                                        
				-- ,ERROR_SEVERITY() AS ErrorSeverity                                        
				-- ,ERROR_STATE() AS ErrorState                                        
				-- ,ERROR_PROCEDURE() AS ErrorProcedure                                        
				-- ,ERROR_LINE() AS ErrorLine                                        
			SET	@error_message = ERROR_MESSAGE()
			SET @text = 'Failed to mask table: ' + @DB_NAME + '.' + @table_name + ', Column: ' + @column_name + '. ' + @error_message

			INSERT INTO dm_log VALUES (getdate(), @text)
			PRINT(@text)
			CLOSE @cur;
			DEALLOCATE @cur;
		
	END CATCH
END
GO

4. Run data masking for specified database ('cetgroups3_dm')

USE [cetgroups3_dm]
GO

EXEC sp_data_masking 'cetgroups3_dm';

5. Monitor Data Masking Progress Status

SELECT
	ja.job_id,
	ja.session_id,
	j.name AS job_name,
	ja.start_execution_date,      
	ISNULL(last_executed_step_id,0)+1 AS current_executed_step_id,
	Js.step_name,
	DATEDIFF(MINUTE, ja.start_execution_date, GETDATE()) AS Duration_min
FROM msdb.dbo.sysjobactivity ja 
LEFT JOIN msdb.dbo.sysjobhistory jh ON ja.job_history_id = jh.instance_id
JOIN msdb.dbo.sysjobs j ON ja.job_id = j.job_id
JOIN msdb.dbo.sysjobsteps js
	ON ja.job_id = js.job_id
	AND ISNULL(ja.last_executed_step_id,0)+1 = js.step_id
WHERE
	ja.session_id = (
	SELECT TOP 1 session_id FROM msdb.dbo.syssessions ORDER BY agent_start_date DESC
	)
-- AND j.name = @Job_Name
AND start_execution_date is not null
AND stop_execution_date is null;

select count(*) from dm_log;

select * from dm_log order by time_date desc;