SQL Server 2008 and HashBytes
Solution 1
Just use this function (taken from Hashing large data strings with a User Defined Function):
create function dbo.fn_hashbytesMAX
( @string nvarchar(max)
, @Algo varchar(10)
)
returns varbinary(20)
as
/************************************************************
*
* Author: Brandon Galderisi
* Last modified: 15-SEP-2009 (by Denis)
* Purpose: hashes an nvarchar(max) string of any length with
*          the system function hashbytes.
*          Input that fits in one 4000-character (8000-byte)
*          chunk is hashed directly. Longer input is split
*          into 4000-character chunks; each chunk is hashed,
*          the last 40 characters of each hash's hex string
*          (sys.fn_varbintohexstr) are concatenated in chunk
*          order, and that string is hashed again by calling
*          this function recursively.
*
* Parameters:
*   @string  text to hash (nvarchar, so varchar arguments are
*            converted to nvarchar before hashing)
*   @Algo    algorithm name passed straight to hashbytes
*
* Returns:   varbinary(20); the declared return type limits
*            the result to 20 bytes.
*
*************************************************************/
begin
    declare @concat nvarchar(max)
          , @NumHash int
          , @HASH varbinary(20)

    -- number of 4000-character chunks needed to cover @string
    -- (datalength is in bytes, nvarchar uses 2 bytes per character)
    set @NumHash = ceiling((datalength(@string)/2)/(4000.0))

    /* HashBytes only supports 8000 bytes so split the string if it is larger */
    if @NumHash > 1
    begin
        -- row generator: each level squares the row count of the previous one
        ;with a as (select 1 as n union all select 1)        -- 2
        ,b as (select 1 as n from a cross join a a1)          -- 4
        ,c as (select 1 as n from b cross join b b1)          -- 16
        ,d as (select 1 as n from c cross join c c1)          -- 256
        ,e as (select 1 as n from d cross join d d1)          -- 65,536
        ,f as (select 1 as n from e cross join e e1)          -- 4,294,967,296 rows x 4000 = 17+ TRILLION characters
        ,factored as (select row_number() over (order by n) rn from f)
        -- factor is the 1-based start position of the chunk AFTER chunk rn,
        -- so chunk rn itself starts at factor - 4000
        ,factors as (select rn, (rn*4000)+1 factor from factored)
        select @concat = cast((
            select right(sys.fn_varbintohexstr
                         (
                             hashbytes(@Algo, substring(@string, factor - 4000, 4000))
                         )
                         , 40) + ''
            from factors
            where rn <= @NumHash
            -- the final hash depends on the order of the chunk hashes;
            -- without an ORDER BY the concatenation order is not guaranteed
            order by rn
            for xml path('')
        ) as nvarchar(max))

        -- the concatenated hex string may itself exceed one chunk, hence the recursion
        set @HASH = dbo.fn_hashbytesMAX(@concat, @Algo)
    end
    else
    begin
        set @HASH = convert(varbinary(20), hashbytes(@Algo, @string))
    end

    return @HASH
end
And the results are as follows:
SELECT
    HASHBYTES('sha1', N'test'),             /* built-in function, nvarchar literal */
    HASHBYTES('sha1', 'test'),              /* built-in function, varchar literal */
    dbo.fn_hashbytesMAX('test', 'sha1'),    /* Galderisi's function; the argument is converted to nvarchar */
    dbo.fnGetHash('sha1', 'test');          /* the function from the question */
Output:
0x87F8ED9157125FFC4DA9E06A7B8011AD80A53FE1
0xA94A8FE5CCB19BA61C4C0873D391E987982FBBD3
0x87F8ED9157125FFC4DA9E06A7B8011AD80A53FE1
0x00000000AE6DBA4E0F767D06A97038B0C24ED720662ED9F1
Solution 2
If you can't create a function and have to use something that already exists in the DB:
sys.fn_repl_hash_binary
can be made to work using the syntax:
sys.fn_repl_hash_binary(cast('some really long string' as varbinary(max)))
Taken from: http://www.sqlnotes.info/2012/01/16/generate-md5-value-from-big-data/
Solution 3
I've taken the accepted answer, and modified it a bit with the following improvements:
- no longer recursive function
- now schema bound
- no longer relying on undocumented stored procedures
- two versions: one for nvarchar, one for varchar
- returns same data size as HASHBYTES, leaving it up to the end user to convert to smaller based on algorithm used. This allows the functions to support future algorithms with larger data returns.
With these changes, the functions can now be used in persisted computed columns as they are now marked deterministic when created.
/*
    dbo.fnHashBytesNVARCHARMAX
    Hashes an NVARCHAR(MAX) value of any length with HASHBYTES.

    Parameters:
        @Algorithm  algorithm name passed straight to HASHBYTES
        @Text       text to hash

    Returns:
        VARBINARY(8000) - the HASHBYTES result, not trimmed to the
        algorithm's digest size.

    While @Text is larger than 8000 bytes it is replaced by the
    concatenation, in chunk order, of the hex-string hashes
    (CONVERT style 1) of its 4000-character chunks. The loop repeats
    until the text fits in a single HASHBYTES call.
*/
CREATE FUNCTION dbo.fnHashBytesNVARCHARMAX
(
    @Algorithm VARCHAR(10),
    @Text NVARCHAR(MAX)
)
RETURNS VARBINARY(8000)
WITH SCHEMABINDING
AS
BEGIN
    DECLARE @NumHash INT;
    DECLARE @HASH VARBINARY(8000);

    -- number of 8000-byte (4000-character) chunks needed to cover @Text
    SET @NumHash = CEILING(DATALENGTH(@Text) / (8000.0));

    /* HashBytes only supports 8000 bytes so split the string if it is larger */
    WHILE @NumHash > 1
    BEGIN
        -- row generator: each level squares the row count of the previous one
        WITH a AS
            (SELECT 1 AS n UNION ALL SELECT 1),         -- 2
        b AS
            (SELECT 1 AS n FROM a CROSS JOIN a a1),      -- 4
        c AS
            (SELECT 1 AS n FROM b CROSS JOIN b b1),      -- 16
        d AS
            (SELECT 1 AS n FROM c CROSS JOIN c c1),      -- 256
        e AS
            (SELECT 1 AS n FROM d CROSS JOIN d d1),      -- 65,536
        f AS
            (SELECT 1 AS n FROM e CROSS JOIN e e1),      -- 4,294,967,296 rows x 4000 = 17+ TRILLION characters
        factored AS
            (SELECT ROW_NUMBER() OVER (ORDER BY n) rn FROM f),
        -- factor is the start position of the chunk AFTER chunk rn,
        -- so chunk rn itself starts at factor - 4000
        factors AS
            (SELECT rn, (rn * 4000) + 1 factor FROM factored)
        SELECT @Text = CAST
        (
            (
                SELECT CONVERT(VARCHAR(MAX), HASHBYTES(@Algorithm, SUBSTRING(@Text, factor - 4000, 4000)), 1)
                FROM factors
                WHERE rn <= @NumHash
                -- the final hash depends on the order of the chunk hashes;
                -- without an ORDER BY the concatenation order is not guaranteed
                ORDER BY rn
                FOR XML PATH('')
            ) AS NVARCHAR(MAX)
        );

        -- the concatenated hashes may still exceed 8000 bytes; re-check
        SET @NumHash = CEILING(DATALENGTH(@Text) / (8000.0));
    END;

    SET @HASH = CONVERT(VARBINARY(8000), HASHBYTES(@Algorithm, @Text));
    RETURN @HASH;
END;
/*
    dbo.fnHashBytesVARCHARMAX
    Hashes a VARCHAR(MAX) value of any length with HASHBYTES.

    Parameters:
        @Algorithm  algorithm name passed straight to HASHBYTES
        @Text       text to hash

    Returns:
        VARBINARY(8000) - the HASHBYTES result, not trimmed to the
        algorithm's digest size.

    While @Text is larger than 8000 bytes it is replaced by the
    concatenation, in chunk order, of the hex-string hashes
    (CONVERT style 1) of its 8000-character chunks. The loop repeats
    until the text fits in a single HASHBYTES call.
*/
CREATE FUNCTION dbo.fnHashBytesVARCHARMAX
(
    @Algorithm VARCHAR(10),
    @Text VARCHAR(MAX)
)
RETURNS VARBINARY(8000)
WITH SCHEMABINDING
AS
BEGIN
    DECLARE @NumHash INT;
    DECLARE @HASH VARBINARY(8000);

    -- number of 8000-byte chunks needed to cover @Text
    SET @NumHash = CEILING(DATALENGTH(@Text) / (8000.0));

    /* HashBytes only supports 8000 bytes so split the string if it is larger */
    WHILE @NumHash > 1
    BEGIN
        -- row generator: each level squares the row count of the previous one
        WITH a AS
            (SELECT 1 AS n UNION ALL SELECT 1),         -- 2
        b AS
            (SELECT 1 AS n FROM a CROSS JOIN a a1),      -- 4
        c AS
            (SELECT 1 AS n FROM b CROSS JOIN b b1),      -- 16
        d AS
            (SELECT 1 AS n FROM c CROSS JOIN c c1),      -- 256
        e AS
            (SELECT 1 AS n FROM d CROSS JOIN d d1),      -- 65,536
        f AS
            (SELECT 1 AS n FROM e CROSS JOIN e e1),      -- 4,294,967,296 rows x 8000 = 34+ TRILLION characters
        factored AS
            (SELECT ROW_NUMBER() OVER (ORDER BY n) rn FROM f),
        -- factor is the start position of the chunk AFTER chunk rn,
        -- so chunk rn itself starts at factor - 8000
        factors AS
            (SELECT rn, (rn * 8000) + 1 factor FROM factored)
        SELECT @Text = CAST
        (
            (
                SELECT CONVERT(VARCHAR(MAX), HASHBYTES(@Algorithm, SUBSTRING(@Text, factor - 8000, 8000)), 1)
                FROM factors
                WHERE rn <= @NumHash
                -- the final hash depends on the order of the chunk hashes;
                -- without an ORDER BY the concatenation order is not guaranteed
                ORDER BY rn
                FOR XML PATH('')
            ) AS VARCHAR(MAX)   -- @Text is VARCHAR(MAX); the hex digits convert losslessly
        );

        -- the concatenated hashes may still exceed 8000 bytes; re-check
        SET @NumHash = CEILING(DATALENGTH(@Text) / (8000.0));
    END;

    SET @HASH = CONVERT(VARBINARY(8000), HASHBYTES(@Algorithm, @Text));
    RETURN @HASH;
END;
Solution 4
Tested and working: select master.sys.fn_repl_hash_binary(someVarbinaryMaxValue) - and it is not complicated :)
Solution 5
You could write a SQL CLR function:
/// <summary>
/// SQL CLR scalar function that hashes a string of any length.
/// </summary>
/// <param name="algorithm">Hash algorithm name, resolved with HashAlgorithm.Create.</param>
/// <param name="data">Text to hash; it is encoded as UTF-8 before hashing.</param>
/// <returns>The hash bytes, or SQL NULL when either argument is NULL.</returns>
/// <exception cref="System.ArgumentException">The algorithm name is not recognised.</exception>
[Microsoft.SqlServer.Server.SqlFunction]
public static SqlBinary BigHashBytes(SqlString algorithm, SqlString data)
{
    // Return SQL NULL for NULL arguments instead of letting .Value throw.
    if (algorithm.IsNull || data.IsNull)
    {
        return SqlBinary.Null;
    }

    // HashAlgorithm is IDisposable; release it as soon as the hash is computed.
    using (var algo = HashAlgorithm.Create(algorithm.Value))
    {
        // Create returns null for an unknown name; fail with a clear message
        // rather than a NullReferenceException on ComputeHash.
        if (algo == null)
        {
            throw new System.ArgumentException(
                "Unknown hash algorithm: " + algorithm.Value, "algorithm");
        }

        // NOTE(review): UTF-8 bytes equal the bytes HASHBYTES sees only for
        // plain-ASCII varchar input; SQL Server nvarchar data is UTF-16 LE
        // (SqlString.GetUnicodeBytes) - confirm which encoding callers expect.
        var bytes = Encoding.UTF8.GetBytes(data.Value);
        return new SqlBinary(algo.ComputeHash(bytes));
    }
}
And then it can be called in SQL like this:
/* both statements yield the same hash */
SELECT HASHBYTES('md5', 'test stuff');
SELECT dbo.BigHashBytes('md5', 'test stuff');
BigHashBytes is only necessary if the input length would be over 8k (the 8000-byte limit of HASHBYTES).
Comments
-
Bob about 2 years
I have quite a large nvarchar which I wish to pass to the HashBytes function. I get the error:
"String or binary would be truncated. Cannot insert the value NULL into column 'colname', table 'table'; column does not allow nulls. UPDATE fails. The statement has been terminated."
Being ever resourceful, I discovered this was due to the HashBytes function having a maximum limit of 8000 bytes. Further searching showed me a 'solution' where my large varchar would be divided and hashed separately and then later combined with this user defined function:
-- udfLargeHashTable(@algorithm, @InputDataString): converts the input to binary,
-- walks it in 8000-byte slices, runs HASHBYTES on each slice and appends every
-- digest to @ReturnSum, which is returned as varbinary(max).
-- @ReturnSum is seeded with the integer 0 before the loop, so presumably the
-- returned value starts with the binary form of that 0 ahead of the first digest.
-- NOTE(review): convert(binary, ...) is called without a length; SQL Server
-- documents a default length of 30 for binary in CAST/CONVERT, which would
-- truncate or pad the input before hashing - confirm; varbinary(max) looks intended.
-- NOTE(review): the result is a concatenation of per-slice digests, not a single
-- digest, so it is not expected to equal HASHBYTES of the same input - confirm intent.
function [dbo].[udfLargeHashTable] (@algorithm nvarchar(4), @InputDataString varchar(MAX)) RETURNS varbinary(MAX) AS BEGIN DECLARE @Index int, @InputDataLength int, @ReturnSum varbinary(max), @InputData varbinary(max) SET @ReturnSum = 0 SET @Index = 1 SET @InputData = convert(binary,@InputDataString) SET @InputDataLength = DATALENGTH(@InputData) WHILE @Index <= @InputDataLength BEGIN SET @ReturnSum = @ReturnSum + HASHBYTES(@algorithm, SUBSTRING(@InputData, @Index, 8000)) SET @Index = @Index + 8000 END RETURN @ReturnSum END
which I call with:
set @ReportDefinitionHash=convert(int,dbo.[udfLargeHashTable]('SHA1',@ReportDefinitionForLookup))
Where @ReportDefinitionHash is int, and @ReportDefinitionForLookup is the varchar
Passing a simple char like 'test' produces a different int with my UDF than a normal call to HashBytes would produce.
Any advice on this issue?
-
irag10 almost 11 years
I think there's a bug here. Calling dbo.fn_hashbytesMAX() with large values results in the same hash. It looks to me like the @string parameter type needs to be nvarchar(max) rather than varchar(max); otherwise halving the datalength() result doesn't make sense. As it is, datalength(@string)/2 means it's only hashing half as many substrings as it should.
-
irag10 almost 11 years
NB: only available from SQL Server 2008 onwards
-
irag10 almost 11 years
I see that the function originally provided was for nvarchar(max) input and was changed. Anyone using this should either change the @string datatype to nvarchar(max) or change the code to work correctly (which probably means changing the other nvarchar to varchar and removing the /2, but you'd want to test).
-
irag10 almost 11 years
I've edited the answer as per my previous comments - it now takes and computes using nvarchar. It won't output the same value as hashbytes() if passed a varchar value, as the argument is first cast to nvarchar. Changed to return varbinary so that calling with the md5 algorithm returns the correct length.
-
gotqn about 10 years
Won't work if you have utf-8 data - NVARCHAR string
-
Michael J Swart over 9 years
SQL Server doesn't use utf-8 strings. I had no problems with NVARCHAR strings.
-
Solomon Rutzky about 9 years
1) String data in SQL Server is stored as UTF-16 Little Endian, which equates to "Unicode" in .NET. 2) You don't need to mess with Encoding.<encoding_name> since SqlString can just give you the Unicode byte[] via SqlString.GetUnicodeBytes.
-
rshadman about 9 years
Please see my answer below for a modified version of this solution that can be used with persisted computed columns.