Always Encrypted Storage Overhead

The new Always Encrypted feature in SQL Server 2016 adds some overhead to the storage requirements of a column. I wanted to do some analysis to determine just how much overhead is incurred for various data types. This test was run against CTP 3.2.

Methodology

After creating a database along with a column master key and the column encryption key, I created four tables to store the encrypted data.

if exists (select * from sys.tables where name = 'VarcharAnalysis')
	drop table VarcharAnalysis;
create table VarcharAnalysis
(
	CharCount int null,
	CharData varchar(8000) collate Latin1_General_BIN2
		encrypted with (column_encryption_key = [CEK_Auto1], 
		encryption_type = Deterministic, 
		algorithm = 'AEAD_AES_256_CBC_HMAC_SHA_256') null
);
 
create clustered index cl_VarcharAnalysis on VarcharAnalysis (CharCount);
 
if exists (select * from sys.tables where name = 'NVarcharAnalysis')
	drop table NVarcharAnalysis;
create table NVarcharAnalysis
(
	CharCount int null,
	CharData nvarchar(4000) collate Latin1_General_BIN2
		encrypted with (column_encryption_key = [CEK_Auto1], 
		encryption_type = Deterministic, 
		algorithm = 'AEAD_AES_256_CBC_HMAC_SHA_256') null
);
 
create clustered index cl_NVarcharAnalysis on NVarcharAnalysis (CharCount);
 
if exists (select * from sys.tables where name = 'VarbinaryAnalysis')
	drop table VarbinaryAnalysis;
create table VarbinaryAnalysis
(
	ByteCount int null,
	BinaryData varbinary(8000)
		encrypted with (column_encryption_key = [CEK_Auto1], 
		encryption_type = Deterministic, 
		algorithm = 'AEAD_AES_256_CBC_HMAC_SHA_256') null
);
 
create clustered index cl_VarbinaryAnalysis on VarbinaryAnalysis (ByteCount);
 
if exists (select * from sys.tables where name = 'FixedSizeAnalysis')
	drop table FixedSizeAnalysis;
create table FixedSizeAnalysis
(
	TinyIntData tinyint
		encrypted with (column_encryption_key = [CEK_Auto1], 
		encryption_type = Deterministic, 
		algorithm = 'AEAD_AES_256_CBC_HMAC_SHA_256') null,
	SmallIntData smallint
		encrypted with (column_encryption_key = [CEK_Auto1], 
		encryption_type = Deterministic, 
		algorithm = 'AEAD_AES_256_CBC_HMAC_SHA_256') null,
	IntData int
		encrypted with (column_encryption_key = [CEK_Auto1], 
		encryption_type = Deterministic, 
		algorithm = 'AEAD_AES_256_CBC_HMAC_SHA_256') null,
	BigIntData bigint
		encrypted with (column_encryption_key = [CEK_Auto1], 
		encryption_type = Deterministic, 
		algorithm = 'AEAD_AES_256_CBC_HMAC_SHA_256') null,
	DecimalData decimal(38,0)
		encrypted with (column_encryption_key = [CEK_Auto1], 
		encryption_type = Deterministic, 
		algorithm = 'AEAD_AES_256_CBC_HMAC_SHA_256') null,
	BitData bit
		encrypted with (column_encryption_key = [CEK_Auto1], 
		encryption_type = Deterministic, 
		algorithm = 'AEAD_AES_256_CBC_HMAC_SHA_256') null,
	DateTimeData datetime
		encrypted with (column_encryption_key = [CEK_Auto1], 
		encryption_type = Deterministic, 
		algorithm = 'AEAD_AES_256_CBC_HMAC_SHA_256') null,
	DateTime2Data datetime2
		encrypted with (column_encryption_key = [CEK_Auto1], 
		encryption_type = Deterministic, 
		algorithm = 'AEAD_AES_256_CBC_HMAC_SHA_256') null,
	GuidData uniqueidentifier
		encrypted with (column_encryption_key = [CEK_Auto1], 
		encryption_type = Deterministic, 
		algorithm = 'AEAD_AES_256_CBC_HMAC_SHA_256') null,
);

I then created a small C# application to first insert data into the tables, and then to extract the encrypted values out of the tables.

using (Dal dal = new Dal(instanceName, databaseName, userName, password, true))
{
	dal.InsertVarcharAnalysis(null);
	for (int i = 0; i <= 8000; i++)
	{
		string data = new string('*', i);
		dal.InsertVarcharAnalysis(data);
	}
}
 
using (Dal dal = new Dal(instanceName, databaseName, userName, password, true))
{
	dal.InsertNVarcharAnalysis(null);
	for (int i = 0; i <= 4000; i++)
	{
		string data = new string('*', i);
		dal.InsertNVarcharAnalysis(data);
	}
}
 
using (Dal dal = new Dal(instanceName, databaseName, userName, password, true))
{
	dal.InsertVarbinaryAnalysis(null);
	byte[] template = new byte[8000];
	Enumerable.Range(0, 8000).ToList().ForEach(i => template[i] = (byte)(i % 256));
	for (int i = 0; i <= 8000; i++)
	{
		byte[] data = new byte[i];
		Array.Copy(template, data, i);
		dal.InsertVarbinaryAnalysis(data);
	}
}
 
 
using (Dal dal = new Dal(instanceName, databaseName, userName, password, true))
{
	dal.InsertFixedSizeAnalysis(null, null, null, null, null, null, null, null, null);
	dal.InsertFixedSizeAnalysis(byte.MaxValue, short.MaxValue, int.MaxValue, long.MaxValue, decimal.MaxValue, true,
			DateTime.Now, DateTime.Now, Guid.NewGuid());
}

In my Dal object’s constructor, the final parameter is a boolean value indicating whether to set the Column Encryption Setting to “Enabled”. This is necessary because the inserts must be done when encryption is enabled, but we need to read back the tables with encryption disabled so that we get back the raw, encrypted bytes instead of the decrypted data.

The analysis code looks like this:

using (Dal dal = new Dal(instanceName, databaseName, userName, password, false))
{
	List records = dal.ReadVarcharAnalysis();
	int? previousRecordSize = int.MinValue;
	foreach (CharDataRecord record in records)
	{
		if (record.Data?.Length != previousRecordSize)
		{
			Console.WriteLine("{0} - {1}", record.Length, record.Data?.Length);
			previousRecordSize = record.Data?.Length;
		}
	}
}
 
using (Dal dal = new Dal(instanceName, databaseName, userName, password, false))
{
	List records = dal.ReadNVarcharAnalysis();
	int? previousRecordSize = int.MinValue;
	foreach (CharDataRecord record in records)
	{
		if (record.Data?.Length != previousRecordSize)
		{
			Console.WriteLine("{0} - {1}", record.Length, record.Data?.Length);
			previousRecordSize = record.Data?.Length;
		}
	}
}
 
using (Dal dal = new Dal(instanceName, databaseName, userName, password, false))
{
	List records = dal.ReadVarbinaryAnalysis();
	int? previousRecordSize = int.MinValue;
	foreach (CharDataRecord record in records)
	{
		if (record.Data?.Length != previousRecordSize)
		{
			Console.WriteLine("{0} - {1}", record.Length, record.Data?.Length);
			previousRecordSize = record.Data?.Length;
		}
	}
}
 
using (Dal dal = new Dal(instanceName, databaseName, userName, password, false))
{
	List records = dal.ReadFixedSizeAnalysis();
	foreach (FixedSizeDataRecord record in records)
	{
		Console.WriteLine("TinyInt: {0}", record.TinyIntData?.Length);
		Console.WriteLine("SmallInt: {0}", record.SmallIntData?.Length);
		Console.WriteLine("Int: {0}", record.IntData?.Length);
		Console.WriteLine("BigInt: {0}", record.BigIntData?.Length);
		Console.WriteLine("Decimal: {0}", record.DecimalData?.Length);
		Console.WriteLine("Bit: {0}", record.BitData?.Length);
		Console.WriteLine("DateTime: {0}", record.DateTimeData?.Length);
		Console.WriteLine("DateTime2: {0}", record.DateTime2Data?.Length);
		Console.WriteLine("Guid: {0}", record.GuidData?.Length);
	}
}

Results

Let’s start by looking at the varchar data. Here are the first few rows in the table:

VarcharAnalysisTopRows

One thing to note is that, not surprisingly, a NULL input to the encryption function yields a NULL output This was true everywhere in this test. However, a blank string (0-length) generates 65 bytes of output, as does an input containing 1 character, 2 characters, etc. At 16 characters of input, the output grows to 81 bytes, and when input reaches 32 characters the output is 97 bytes.

Input Size (characters)Output Size (bytes)
065
1681
3297
48113
64129
80145
96161
112177
128193

The pattern is quickly apparent. We can express this relationship very simply as:

C = 16 * floor(P / 16) + 65

where P is the input plaintext size in characters and C is the output cryptotext size in bytes.

Next, the nvarchar data. This has similar growth patterns but, not surprisingly, the encryption output grows at 8-character boundaries.

Input Size (characters)Output Size (bytes)
065
881
1697
24113
32129
40145
48161
56177
64193
72209

C = 16 * floor(P / 8) + 65

This is the same result as before taking into account that each input character is now two bytes instead of one.

Varbinary data, as you might expect, follows the same pattern as varchar data:

Input Size (characters)Output Size (bytes)
065
1681
3297
48113
64129
80145
96161
112177
128193

Finally, let’s look at fixed-width data. It turns out that the tinyint, smallint, int and bigint data types now occupy 65 bytes, as do bit, datetime and datetime2 columns. The decimal and uniqueidentifier data types require 81 bytes when they are encrypted. Again, this fits our pattern we have observed so far.

Data TypeUnencrypted (bytes)Encrypted (bytes)
tinyint165
smallin265
int465
bigint865
decimal(38,0)1781
bit0.12565
datetime865
datetime2865
uniqueidentifier1681

Conclusions

The size of the encrypted data throughout this analysis is very consistently

C = 16 * floor(P / 16) + 65

where P is the size, in bytes, of the unencrypted data, and C is the resulting size, again in bytes, of the encrypted data. This means that small inputs incur a significant amount of overhead, but as the input grows the overhead becomes only a tiny fraction. The greatest overhead is observed with the bit data type, where the encrypted data size is 520 times the unencrypted data size.

Coda

You may have noticed that I created all of the tables with the encryption type set to Deterministic. I re-ran this test using Randomized encryption throughout, and the result are identical.

Observe that these results are based on the encrypted data size after the initial insert into the table. I’m not sure if there is anything later in the process, such as key rotation, that may cause the data size to grow, so keep that limitation in mind.

Always Encrypted and sys.dm_exec_describe_first_result_set

Suppose we have the following table created in SQL Server 2016 CTP 3.2 (presuming that the encryption keys have already been created):

create table dbo.Customer
(
	CustomerID int NULL,
	FirstName varchar(50) collate Latin1_General_BIN2
		ENCRYPTED WITH (COLUMN_ENCRYPTION_KEY = CEK_Auto1,
		ENCRYPTION_TYPE = Deterministic,
		ALGORITHM = 'AEAD_AES_256_CBC_HMAC_SHA_256') NULL,
	LastName varchar(50) collate Latin1_General_BIN2
		ENCRYPTED WITH (COLUMN_ENCRYPTION_KEY = CEK_Auto1,
		ENCRYPTION_TYPE = Deterministic,
		ALGORITHM = 'AEAD_AES_256_CBC_HMAC_SHA_256') NULL,
	Address varchar(50) NULL,
	City varchar(50) NULL,
	State varchar(50) NULL
);

Now suppose we connect to the database using a connection where Column Encryption Setting=Disabled and execute:

select column_ordinal, name, system_type_name
from sys.dm_exec_describe_first_result_set(N'select * from dbo.Customer', null, null);

Then we get the following results:

DescribeFirstResultSet_EncryptionDisabled

The first and last name columns, rather than coming back as varchar(50) are instead shown as varbinary(113). This isn’t too surprising, given that we are on an unencrypted connection, and if we actually execute the select against the Customer table we will, in fact, get back binary data.

So surely if we use a connection where Column Encryption Setting=Enabled, we will get the correct data type back, right?

Nope.

DescribeFirstResultSet_EncryptionEnabled

We still get back varbinary(113).

I’m not sure how I feel about these results. On the one hand, the results are consistent regardless of the column encryption setting, and they do reflect the true data type at the storage engine level. However, the real usefulness of sys.dm_exec_describe_first_result_set (and its companion stored procedure sys.sp_describe_first_result_set, which gives identical results) is the ability to discover the data types coming back from a particular T-SQL statement. If used as intended (that is, on a connection with the column encryption setting enabled), the metadata coming back doesn’t match what will be returned by the actual query, significantly reducing the value of this DMF.

The official documentation for the DMF doesn’t address the impact of Always Encrypted, and the wording is sufficiently vague (“describes the metadata”) that it’s hard to say what the “correct” results should be in this case.

Of course, it could very well be something that Microsoft is already planning to address before RTM.

Defeating Dynamic Data Masking

I can’t recall exactly where or who said it, but at some point over this past summer I recall someone mentioning that the new dynamic data masking feature in SQL Server 2016 could be easily worked around by selecting the data into a temporary table.  I shelved that thought for several months, and then this past weekend I started playing with it, only to find that the issue had been fixed in CTP 2.4.  However, I wanted to document some of the things that I tried just to show that it isn’t so easy any more to defeat the benefits of this new feature.  These tests were run in CTP 3.2.

Let’s start with the sample table from Books Online:

CREATE TABLE Membership
  (MemberID int IDENTITY PRIMARY KEY,
   FirstName varchar(100) NULL,
   LastName varchar(100) NOT NULL,
   Phone# varchar(12) NULL,
   EMail varchar(100) NULL);
 
ALTER TABLE Membership ALTER COLUMN FirstName ADD MASKED WITH (FUNCTION = 'partial(1,"XXXXXXX",0)');
ALTER TABLE Membership ALTER COLUMN Phone# ADD MASKED WITH (FUNCTION = 'default()');
ALTER TABLE Membership ALTER COLUMN Email ADD MASKED WITH (FUNCTION = 'email()');
 
INSERT Membership (FirstName, LastName, Phone#, EMail) VALUES 
('Roberto', 'Tamburello', '555.123.4567', 'RTamburello@contoso.com'),
('Janice', 'Galvin', '555.123.4568', 'JGalvin@contoso.com.co'),
('Zheng', 'Mu', '555.123.4569', 'ZMu@contoso.net');
 
CREATE USER TestUser WITHOUT LOGIN;
GRANT SELECT ON Membership TO TestUser;

We can test that DDM is working by running:

EXECUTE AS USER = 'TestUser';
SELECT * FROM Membership;
REVERT;

The results are:

ddmResults1

Next, we will try to select the results into a temporary table and then select from the temp table.

EXECUTE AS USER = 'TestUser';
if object_id('tempdb..#Membership') is not null
	drop table #Membership;
SELECT * into #Membership FROM Membership;
select * from #Membership;
REVERT;

The results are identical.

ddmResults2

In some cases attributes of the table get copied as part of a SELECT … INTO statement, so the next test is to explicitly create the temp table:

EXECUTE AS USER = 'TestUser';
if object_id('tempdb..#Membership') is not null
	drop table #Membership;
CREATE TABLE #Membership
  (MemberID int IDENTITY PRIMARY KEY,
   FirstName varchar(100) NULL,
   LastName varchar(100) NOT NULL,
   Phone# varchar(12) NULL,
   EMail varchar(100) NULL);
 
set identity_insert #Membership on;
 
insert #Membership (MemberID, FirstName, LastName, [Phone#], Email)
SELECT MemberID, FirstName, LastName, [Phone#], Email FROM Membership;
 
set identity_insert #Membership off;
 
select * from #Membership;
REVERT;

The results are identical:

ddmResults3

What if we do something very similar, but use one of the columns in an expression?

EXECUTE AS USER = 'TestUser';
if object_id('tempdb..#Membership') is not null
	drop table #Membership;
CREATE TABLE #Membership
  (MemberID int IDENTITY PRIMARY KEY,
   FirstName varchar(100) NULL,
   LastName varchar(100) NOT NULL,
   Phone# varchar(12) NULL,
   EMail varchar(100) NULL);
set identity_insert #Membership on;
 
insert #Membership (MemberID, FirstName, LastName, [Phone#], Email)
SELECT MemberID, FirstName + '', LastName, [Phone#], Email FROM Membership; 
set identity_insert #Membership off;
select * from #Membership;
REVERT;

ddmResults4

Interestingly, this strips the first character from the results, yielding only “xxxx” in the FirstName column!

We can try and see if the “masked” attribute was copied to the temp table created as a SELECT … INTO statement.

EXECUTE AS USER = 'TestUser';
if object_id('tempdb..#Membership') is not null
	drop table #Membership;
SELECT * into #Membership FROM Membership;
 
alter table #Membership alter column FirstName drop masked;
 
select * from #Membership;
REVERT;

Msg 16007, Level 16, State 0, Line 32
The column 'FirstName' does not have a data masking function.

Nope, the masking attribute doesn’t get copied to the temp table.

Next, we try selecting into a variable first, then seeing what is in the variable:

EXECUTE AS USER = 'TestUser';
declare @FirstName varchar(100);
select @FirstName = FirstName from Membership where MemberID = 1;
select @FirstName;
REVERT;

Now we get:

ddmResults5

Let’s select the characters individually. Based on what we’ve seen to this point, this probably isn’t going to work.

EXECUTE AS USER = 'TestUser';
declare @c1 char(1), @c2 char(1), @c3 char(1), @c4 char(1), @c5 char(1), @c6 char(1), @c7 char(1)
select @c1 = substring(FirstName, 1, 1), @c2 = substring(FirstName, 2, 1), @c3 = substring(FirstName, 3, 1), @c4 = substring(FirstName, 4, 1),
	@c5 = substring(FirstName, 5, 1), @c6 = substring(FirstName, 6, 1), @c7 = substring(FirstName, 7, 1)
from Membership where MemberID = 1;
select @c1, @c2, @c3, @c4, @c5, @c6, @c7;
REVERT;

And indeed we only get:

ddmResults6

Finally, we will create another database and create a user as database owner:

create database AnotherDatabase;
GO
use AnotherDatabase;
 
create login AnotherDBUser with password = 'B@dP@$$w0rd';
create user AnotherDBUser for login AnotherDBUser;
 
alter role db_owner add member AnotherDBUser;

Now switch back to the database containing the member table, and grant this login read-only access.

create user AnotherDBUser for login AnotherDBUser;
alter role [db_datareader] add member AnotherDBUser;

And try it out:

execute as login = 'AnotherDBUser';
SELECT * FROM Membership;
REVERT;
 
execute as user = 'AnotherDBUser';
SELECT * FROM Membership;
REVERT;

Both queries return the same masked result set.

ddmResults7