Search

Is using CLR to parser strings will always be a good solution

Sharon Rimer
Jan 15, 2017
5 min read

אחת לזמן מה, בדרך כלל מידיי חודש, אנו שולחים בחברה אתגר טכנולוגי לכל צוות הSQL Server בנאיה.

אני רוצה לשתף אתכם באתגר שעלה בחודש נובמבר 2016 ע"י עדי כהן.

מדוע אני חושב שהאתגר הזה ראוי לשיתוף הקהל הרחב?

מכיוון, שלהפתעתי הוא נגד את החשיבה הראשונית שאיתה הלכתי בתחילת האתגר, אני מאמין שכך גם לגבי המחשבה של אחרים.

האתגר שהוצג -

צריך לכתוב פרוצדורה, שמקבלת string בתור פרמטר. הstring יכול להיות מורכב מאותיות וסימנים (למשל #,$,! וכד') וכמובן מספרים. אפשר להניח שהאותיות יהיו אותיות באנגלית בלבד. הפרוצדורה אמורה לבדוק את הסטרינג ובכל מקום שיש בו לפחות 2 אותיות גדולות ברצף, להפוך את האותיות לאותיות קטנות, אבל לסמן את תחילת הרצף ואת סוף הרצף ב ו בהתאמה.

אני מצרף מספר משפטים לפני ואחרי הרצת הקוד שתוכלו להשתמש בהם לבדיקות הקוד:

This is a TEST. Hop that it will work -> This is atest. Hope that it will work.

SQL Server is a relational DB -> sql Server is a relational db

This one should not be modified -> This one should not be modified

Do U 2 have the CORRECT # For me? -> Do U 2 have the correct # For me?

האתגר נבדק מול מסד נתונים בSQL Server 2016– WideWorldImporters

ניתן להוריד מכאן

צורת הבדיקה של הפתרונות הייתה להפוך את הקוד לפונקציה סקלארית, ולהפעיל אותה במשפט SELECT על עמודת Description מטבלת Sales.OrderLines בבסיס הנתונים WideWorldImporters. הטבלה מכילה מעל 230000 רשומות, כך שניתן לראות בצורה די טובה את הבדלי הביצועים בין כל אחד מהדרכים.

פרמטרים שנבדקו זמן הריצה, וזמן הCPU.

כדי לוודא שעמדת נכון בתוצאה הלוגית ניתן להריץ

SELECT *,[dbo].[ufn_YourTry] (t.String)[MyAnswer]

FROM (SELECT 'This is a TEST. Hope that it will work.' [String], 'This is atest. Hope that it will work.'[Adi]

UNION ALL SELECT 'SQL Server is a relational DB' [String], 'sql Server is a relational db'[Adi]

UNION ALL SELECT'This one should not be modified', 'This one should not be modified'[Adi]

UNION ALL SELECT'Do U 2 have the CORRECT # For me?', 'Do U 2 have the correct # For me?'[Adi]

UNION ALL SELECT'Do U 2 HAVe the CORRECT # FOr me?', 'Do U 2 have the correct # For me?'[Adi]

UNION ALL SELECT'teST, test, TEST, tESt', 'test, test, test, test'[Adi]

)t

אשמח, אם תוכלו בשלב זה, לחשוב על פתרון שהייתם מגיעים לתשובה נכונה ותחת ביצועים טובים.

הכיוון שאני הלכתי אליו הוא –

הבעיה המוצגת, הינה בעיה הקשורה במניפולציה על String, תמיד האמנתי שדווקא בזה יש טכנולוגיות הרבה יותר מומלצות, כמו, מחלקות של .net למניפולציות על טקסט בעזרת ביטויים רגולריים.

וכך עשיתי – כתבתי 3 פונקציות עם שיטות שונות לטיפול בבעיה.

ה2 הראשונות לפני פרסום תוצאות האתגר. והשלישית לאחר שגיליתי שהפתרון לא היה מהיר מספיק.

using System;

using System.Data;

using System.Data.SqlClient;

using System.Data.SqlTypes;

using Microsoft.SqlServer.Server;

using System.Text.RegularExpressions;

using System.Linq;

using System.Text;

public partial class UserDefinedFunctions

{

[Microsoft.SqlServer.Server.SqlFunction(DataAccess = DataAccessKind.Read, IsDeterministic = true, Name = "ufn_clr_RegexReplace")]

public static SqlString ufn_clr_RegexReplace([SqlFacet(MaxSize = -1)]SqlString String)

{

string updatedString = string.Empty;

if (!String.IsNull)

{

updatedString = getAdi(String.ToString());

}

return new SqlString(updatedString);

}

//מטודה לטיפול בטקסט בשבל האתגר

static string getAdi(string value)

{

//פצל את הטקסט לשורות ע"פ ביטוי רגולרי שמפצל מילים בעלי אותיות גדולות בלבד

string[] lines = Regex.Split(value, @"\b(\w*[A-Z]\w*[A-Z])\b");

string[] words;

string answer = "";

foreach (string line in lines)

{

//אם המילה היא מילה גדולה הוסף את הלוגיקה של עדי

if (Regex.IsMatch(line, @"([A-Z]\w*[A-Z])"))

{

words = Regex.Split(line, @"([A-Z]\w*[A-Z])");

foreach (string word in words)

{

if (Regex.IsMatch(word, @"([A-Z]\w*[A-Z])"))

answer += "" + word.ToLower() + "";

else answer += word;

}

// אחרת, הוסף את הטקסט הרגיל

else answer += line;

}

return answer;

}

פתרון 2 –

using System;

using System.Data;

using System.Data.SqlClient;

using System.Data.SqlTypes;

using Microsoft.SqlServer.Server;

using System.Text.RegularExpressions;

using System.Linq;

using System.Text;

public partial class UserDefinedFunctions

{

[Microsoft.SqlServer.Server.SqlFunction(DataAccess = DataAccessKind.Read, IsDeterministic = true, Name = "ufn_clr_RegexReplace2")]

public static SqlString ufn_clr_RegexReplace2([SqlFacet(MaxSize = -1)]SqlString String)

{

string updatedString = string.Empty;

if (!String.IsNull)

{

updatedString = getAdi2(String.ToString());

}

return new SqlString(updatedString);

}

static string getAdi2(string value)

{

Regex reg = new Regex(@"(\p{Lu}{2,})+");

var matches = reg.Matches(value);

var list = matches.Cast<Match>().ToList();

StringBuilder sb = new StringBuilder();

if (matches.Count > 0)

{

int i = 0;

list.ForEach(f =>

{

sb.Append(value.Substring(i, f.Index - i) + string.Format("{0}", f.Value.ToLower()));

i = f.Index + f.Value.Length;

});

if (list.Last().Index + list.Last().Value.Length != value.Length)

{

sb.Append(value.Substring((list.Last().Index + list.Last().Value.Length), value.Length - (list.Last().Index + list.Last().Value.Length)));

}

return sb.ToString();

}

return value;

}

הפתרון הטוב ביותר היה של עדי כהן,

הפתרון מוצג כפי שכתוב כאן – (הסבר מטה)

CREATE FUNCTION dbo.Adi

(

@Org NVARCHAR(4000)

)

RETURNS NVARCHAR(4000)

AS

BEGIN

declare @Results varchar(200) = ''

declare @ChIndex int

--Getting the first occurence of more then 1 capital leter

WHILE patindex('%[A-Z][A-Z]%' COLLATE Latin1_General_BIN, @Org) > 0

BEGIN

--Get the location of the first place with more then 1 capital letter

select @ChIndex = patindex('%[A-Z][A-Z]%' COLLATE Latin1_General_BIN, @Org)

--Adding to the results string the part that doesn't need to be modified

set @Results = @Results + substring(@Org,1, @ChIndex - 1) + ''

--Deleting the part that was added to the results from the original string

set @Org = substring(@Org,@ChIndex,len(@Org))

--Getting the end location of the capital letters

select @ChIndex = patindex('%[^A-Z]%' COLLATE Latin1_General_BIN,@Org)

IF @ChIndex = 0

set @ChIndex = len(@Org)+1

--Adding the needed part to the @results string

SELECT @Results = @Results + lower(substring(@Org,1,@ChIndex - 1)) + ''

--Deleting the part from the @Org string

set @Org = substring(@Org,@ChIndex,len(@Org))

END

SET @Results = @Results + @Org

RETURN(@Results)

END

זמן ריצה בשניות - 1.385

זמן CPU - 1.312

הסבר –

הפתרון הוא בTSQL - ביצוע לולאה ע"מ למצוא את כל המקומות שבהם יש רצף של אותיות גדולות צמודות אחת לשנייה. הלולאה לא רצה על כל תו בטקסט, אלה משתמשת בפונקציה patindex ושימוש בחיפוש בטקסט לפי טווחים. הלולאה רצה רק כמספר הפעמים שבהם יש בטקסט רצף של לפחות שתי אותיות גדולות. אם ניקח למשל את הטקסט הבא:

My dream is to be retiree

הקוד בתוך הלולאות לא יופעל בכלל. ההבדל הזה גרם לכך שהקוד שלי יהיה יותר יעיל ולכן ירוץ יותר מהר.

לא התאכזבתי מהCLR והחלטתי למממש פתרון שלישי שמבוסס על הפתרון של עדי שכתב בTSQL.

using System;

using System.Data;

using System.Data.SqlClient;

using System.Data.SqlTypes;

using Microsoft.SqlServer.Server;

using System.Text.RegularExpressions;

using System.Linq;

using System.Text;

public partial class UserDefinedFunctions

{

[Microsoft.SqlServer.Server.SqlFunction(DataAccess = DataAccessKind.Read, IsDeterministic = true, Name = "ufn_clr_RegexReplace3")]

public static SqlString ufn_clr_RegexReplace3([SqlFacet(MaxSize = -1)]SqlString String)

{

string updatedString = string.Empty;

if (!String.IsNull)

{

updatedString = getAdi3(String.ToString());

}

return new SqlString(updatedString);

}

//מטודה לטיפול בטקסט בשבל האתגר

static string getAdi3(string value)

{

//פצל את הטקסט לשורות ע"פ ביטוי רגולרי שמפצל מילים בעלי אותיות גדולות בלבד

string answer = "";

int i;

Regex rx = new Regex(@"(\p{Lu}{2,})+");

Match result;

while (Regex.IsMatch(value, @"(\p{Lu}{2,})+"))

{

result = rx.Match(value);

i = value.IndexOf(result.ToString());

if (i == 0)

answer += "" + result.ToString().ToLower() + "";

else answer += value.Substring(0, i) + "" + result.ToString().ToLower() + "";

value = value.Substring(i + result.Length, value.Length - result.Length - i);

}

answer += value;

return answer;

}

אך, גם כאן זמן הריצה לקח כ6 שניות.

ניתן להוריד את כל פתרונות הCLR כאן

כל הפתרונות מבוססי CLR לקחו בממוצע 6 שניות.

צריך לזכור, clr הוא אפשרות מעולה לכל מיני בעיות, אך חשוב לנסות דרכים נוספות גם אם אתה בטוח בדרך שלך.

תודה רבה לעדי כהן, על האתגרים מידי חודש.

האם תוכלו למצוא דרך מהירה מספיק לפתרון בדרך שונה מהמוצג במאמר?

#SQLServerCLR #SQLServer2016 #WideWorldImporters #SharonRimer #שרוןרימר #עדיכהן #AdiCohn #tSQLt #SQLServer

Comments

Check back soon

Once posts are published, you’ll see them here.

Is using CLR to parser strings will always be a good solution

Comments

Featured Posts

Recent Posts

Snowflake - Getting Started Guide

Restoring SQL Server System Databases

Amazon EMR Cluster to Athena Partitioned Data - Quickly and Simply !

MongoDB Backup Data Directory

Moving Always On DBs without Restarting SQL

MariaDB Galera Cluster Installation

Create Logins On Log Shipping Databases

A first timer PASS summit 2017 journey

A rich man's problem…

Changing INT to BigInt

Archive

Search By Tags

Follow Us