-- File: 57_Regular_Expressions.sql
/**************************************************************
* SQL Server 2025 Regular Expressions Tutorial
* Description: This script demonstrates regular expression support
* introduced in SQL Server 2025 (17.x). It covers:
* - REGEXP_LIKE for pattern matching
* - REGEXP_REPLACE for search and replace
* - REGEXP_SUBSTR for extraction
* - REGEXP_INSTR for position finding
* - REGEXP_COUNT for counting matches
* - REGEXP_MATCHES for capturing groups
* - REGEXP_SPLIT_TO_TABLE for splitting strings
* - Real-world examples for validation and parsing
**************************************************************/
-------------------------------------------------
-- Region: 1. Introduction and Setup
-------------------------------------------------
USE master;
GO

/*
  Start from a clean slate. When a RegexDemo database already exists, kick
  out any open sessions (rolling their work back) and drop it, so that the
  CREATE DATABASE below always succeeds.
*/
IF DB_ID('RegexDemo') IS NOT NULL
BEGIN
    ALTER DATABASE RegexDemo
        SET SINGLE_USER
        WITH ROLLBACK IMMEDIATE;

    DROP DATABASE RegexDemo;
END
GO

CREATE DATABASE RegexDemo;
GO

-- Compatibility level 170 corresponds to SQL Server 2025 (17.x).
ALTER DATABASE RegexDemo
    SET COMPATIBILITY_LEVEL = 170;
GO

-- Every later batch in this script runs inside the demo database.
USE RegexDemo;
GO
-------------------------------------------------
-- Region: 2. REGEXP_LIKE - Pattern Matching
-------------------------------------------------
/*
  REGEXP_LIKE is a Boolean predicate: it evaluates to TRUE when the text
  matches the regex pattern and FALSE otherwise. Like LIKE or EXISTS it is
  written directly as a search condition (WHERE, CASE WHEN, CHECK) and is
  NOT compared with 1 or 0 - T-SQL has no Boolean data type to compare.
  Useful for validation and filtering.
*/

-- 2.1 Create a sample table with various text patterns
CREATE TABLE dbo.CustomerData
(
    CustomerID  INT IDENTITY(1,1) PRIMARY KEY,
    Email       NVARCHAR(100),
    PhoneNumber NVARCHAR(50),
    PostalCode  NVARCHAR(20),
    ProductCode NVARCHAR(50)
);
GO

-- Insert sample data: a deliberate mix of well-formed and malformed values
INSERT INTO dbo.CustomerData (Email, PhoneNumber, PostalCode, ProductCode)
VALUES
    ('john.doe@example.com',     '555-1234',        '12345',      'PRD-2025-001'),
    ('invalid.email@',           '(555) 123-4567',  'AB12 3CD',   'PRD-2024-XYZ'),
    ('jane.smith@company.net',   '555.123.4567',    '12345-6789', 'PROD-99-ABC'),
    ('bob@test',                 '5551234567',      'INVALID',    'PRD-2025-002'),
    ('alice.jones@domain.co.uk', '+1-555-123-4567', 'SW1A 1AA',   'PRD-2025-003');
GO

-- 2.2 Validate email addresses
--     Pattern: local part, '@', domain labels, then a dot and a TLD of 2+ letters.
SELECT
    CustomerID,
    Email,
    CASE
        WHEN REGEXP_LIKE(Email, '^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
            THEN 'Valid'
        ELSE 'Invalid'
    END AS EmailStatus
FROM dbo.CustomerData;
GO

-- 2.3 Validate US ZIP codes (5 digits or 5+4 format)
SELECT
    CustomerID,
    PostalCode,
    CASE
        WHEN REGEXP_LIKE(PostalCode, '^\d{5}(-\d{4})?$')
            THEN 'Valid US ZIP'
        ELSE 'Not US ZIP'
    END AS ZipStatus
FROM dbo.CustomerData;
GO

-- 2.4 Filter rows with valid product codes (PRD-YYYY-NNN format)
SELECT
    CustomerID,
    ProductCode
FROM dbo.CustomerData
WHERE REGEXP_LIKE(ProductCode, '^PRD-\d{4}-\d{3}$');
GO

-- 2.5 Case-insensitive matching with flags
SELECT
    CustomerID,
    Email
FROM dbo.CustomerData
WHERE REGEXP_LIKE(Email, 'EXAMPLE\.COM$', 'i'); -- 'i' flag for case-insensitive
GO
-------------------------------------------------
-- Region: 3. REGEXP_REPLACE - Search and Replace
-------------------------------------------------
/*
  REGEXP_REPLACE finds pattern occurrences and replaces them with a new string.
  The replacement may refer to captured groups with backreferences (\1, \2, ...).
*/

-- 3.1 Normalize phone numbers - remove all non-digit characters
SELECT
    CustomerID,
    PhoneNumber AS OriginalPhone,
    REGEXP_REPLACE(PhoneNumber, '[^0-9]', '') AS NormalizedPhone
FROM dbo.CustomerData;
GO

-- 3.2 Mask email addresses: keep the first two characters of the local part
--     and everything after the '@'; the rest of the local part becomes '***'.
--     Values without an '@' do not match and are returned unchanged.
SELECT
    CustomerID,
    Email AS OriginalEmail,
    REGEXP_REPLACE(Email, '^(.{2}).*@(.*)$', '\1***@\2') AS MaskedEmail
FROM dbo.CustomerData;
GO

-- 3.3 Standardize product codes: both the PRD- and PROD- prefixes become PRODUCT-
SELECT
    CustomerID,
    ProductCode AS OriginalCode,
    REGEXP_REPLACE(ProductCode, '^(PRD|PROD)-', 'PRODUCT-') AS StandardizedCode
FROM dbo.CustomerData;
GO

-- 3.4 Replace runs of whitespace with a single space
CREATE TABLE dbo.TextSamples
(
    ID          INT IDENTITY(1,1) PRIMARY KEY,
    TextContent NVARCHAR(200)
);
GO

-- The first and third rows intentionally contain repeated spaces / tabs so
-- that the '\s+' replacement below has something to collapse. Tabs are
-- written as CHAR(9) so they stay visible and survive editor reformatting.
INSERT INTO dbo.TextSamples (TextContent)
VALUES
    ('This    has   multiple     spaces'),
    ('Normal spacing here'),
    ('Tabs' + CHAR(9) + CHAR(9) + 'and   spaces' + CHAR(9) + ' mixed');
GO

SELECT
    ID,
    TextContent AS Original,
    REGEXP_REPLACE(TextContent, '\s+', ' ') AS Cleaned
FROM dbo.TextSamples;
GO
-------------------------------------------------
-- Region: 4. REGEXP_SUBSTR - Extract Substrings
-------------------------------------------------
/*
  REGEXP_SUBSTR extracts the Nth occurrence of a substring matching the pattern.
  Argument order: (string, pattern, start, occurrence, flags, group).
  NULL is returned when there is no match.
*/

-- 4.1 Extract email domain
--     group = 1 returns only the capture group (the text after '@').
--     A flags value has to be supplied to reach the group argument; the
--     documented default 'c' (case-sensitive) is spelled out instead of NULL
--     so the result never depends on how a NULL flags argument is treated.
SELECT
    CustomerID,
    Email,
    REGEXP_SUBSTR(Email, '@(.+)$', 1, 1, 'c', 1) AS Domain
FROM dbo.CustomerData
WHERE Email IS NOT NULL;
GO

-- 4.2 Extract year from product code (first run of four digits)
SELECT
    CustomerID,
    ProductCode,
    REGEXP_SUBSTR(ProductCode, '\d{4}') AS [Year]
FROM dbo.CustomerData
WHERE ProductCode IS NOT NULL;
GO

-- 4.3 Extract area code from phone number
--     NOTE(review): this takes the first three consecutive digits, which for
--     a 7-digit number such as '555-1234' is the exchange, not an area code.
SELECT
    CustomerID,
    PhoneNumber,
    REGEXP_SUBSTR(PhoneNumber, '\d{3}') AS AreaCode
FROM dbo.CustomerData
WHERE PhoneNumber IS NOT NULL;
GO

-- 4.4 Extract second occurrence
INSERT INTO dbo.TextSamples (TextContent)
VALUES ('Version 2024.01.15 updated to 2025.12.25');
GO

-- Only the row just inserted (highest ID) contains dates, so restrict to it.
SELECT
    ID,
    TextContent,
    REGEXP_SUBSTR(TextContent, '\d{4}\.\d{2}\.\d{2}', 1, 1) AS FirstDate,
    REGEXP_SUBSTR(TextContent, '\d{4}\.\d{2}\.\d{2}', 1, 2) AS SecondDate
FROM dbo.TextSamples
WHERE ID = (SELECT MAX(ID) FROM dbo.TextSamples);
GO
-------------------------------------------------
-- Region: 5. REGEXP_INSTR - Find Position
-------------------------------------------------
/*
  REGEXP_INSTR reports where a pattern match sits inside a string: either
  the position at which the match starts or the position at which it ends.
*/

-- 5.1 Where does the '@' sit in each email?
SELECT
    c.CustomerID,
    c.Email,
    REGEXP_INSTR(c.Email, '@') AS AtPosition
FROM dbo.CustomerData AS c
WHERE c.Email IS NOT NULL;
GO

-- 5.2 Position of the first digit in each product code
SELECT
    c.CustomerID,
    c.ProductCode,
    REGEXP_INSTR(c.ProductCode, '\d') AS FirstDigitPosition
FROM dbo.CustomerData AS c
WHERE c.ProductCode IS NOT NULL;
GO

-- 5.3 End position of the '@domain' part
--     Arguments after the pattern: start = 1, occurrence = 1, and a final 1
--     that requests the end of the match instead of its beginning.
SELECT
    c.CustomerID,
    c.Email,
    REGEXP_INSTR(c.Email, '@[^@]+', 1, 1, 1) AS DomainEndPosition
FROM dbo.CustomerData AS c
WHERE c.Email IS NOT NULL;
GO
-------------------------------------------------
-- Region: 6. REGEXP_COUNT - Count Matches
-------------------------------------------------
/*
  REGEXP_COUNT tells you how many times a pattern matches within a string.
*/

-- 6.1 How many digits does each phone number contain?
SELECT
    c.CustomerID,
    c.PhoneNumber,
    REGEXP_COUNT(c.PhoneNumber, '\d') AS DigitCount
FROM dbo.CustomerData AS c
WHERE c.PhoneNumber IS NOT NULL;
GO

-- 6.2 Word count, where a "word" is a run of word characters (\w+)
SELECT
    t.ID,
    t.TextContent,
    REGEXP_COUNT(t.TextContent, '\w+') AS WordCount
FROM dbo.TextSamples AS t;
GO

-- 6.3 Number of uppercase letters A-Z in each email
SELECT
    c.CustomerID,
    c.Email,
    REGEXP_COUNT(c.Email, '[A-Z]') AS UppercaseCount
FROM dbo.CustomerData AS c
WHERE c.Email IS NOT NULL;
GO

-- 6.4 Number of hyphens in each postal code
SELECT
    c.CustomerID,
    c.PostalCode,
    REGEXP_COUNT(c.PostalCode, '-') AS HyphenCount
FROM dbo.CustomerData AS c
WHERE c.PostalCode IS NOT NULL;
GO
-------------------------------------------------
-- Region: 7. REGEXP_MATCHES - Capture Groups
-------------------------------------------------
/*
  REGEXP_MATCHES is a table-valued function that returns one row per match
  with a fixed set of columns:
      match_id, start_position, end_position, match_value, substring_matches
  substring_matches is a JSON array holding one object per capture group, in
  group order, e.g. [{"value":"...","start":n,"length":n}, ...].
  The function does NOT expose one column per (named) group, so individual
  groups are read positionally with JSON_VALUE(..., '$[i].value').
  Plain (unnamed) groups are used because only their position matters here.
*/

-- 7.1 Parse email into username and domain
--     Group 0 = text before '@', group 1 = text after '@'.
SELECT
    M.match_id,
    M.start_position,
    M.end_position,
    M.match_value,
    M.substring_matches,
    JSON_VALUE(M.substring_matches, '$[0].value') AS username,
    JSON_VALUE(M.substring_matches, '$[1].value') AS domain
FROM dbo.CustomerData AS C
CROSS APPLY REGEXP_MATCHES(C.Email, '([^@]+)@(.+)') AS M
WHERE C.Email IS NOT NULL;
GO

-- 7.2 Parse product code components
--     Groups: 0 = prefix, 1 = four-digit year, 2 = numeric sequence.
--     Codes that do not fit the pattern produce no match row and therefore
--     drop out of the CROSS APPLY result.
SELECT
    C.CustomerID,
    C.ProductCode,
    JSON_VALUE(M.substring_matches, '$[0].value') AS prefix,
    JSON_VALUE(M.substring_matches, '$[1].value') AS [year],
    JSON_VALUE(M.substring_matches, '$[2].value') AS [sequence]
FROM dbo.CustomerData AS C
CROSS APPLY REGEXP_MATCHES(C.ProductCode, '([A-Z]+)-(\d{4})-(\d+)') AS M
WHERE C.ProductCode LIKE 'PRD-%';
GO

-- 7.3 Parse phone number components
--     Groups: 0 = area code, 1 = exchange, 2 = line number; any non-digit
--     separators between them are skipped by [^\d]*.
SELECT
    C.CustomerID,
    C.PhoneNumber,
    JSON_VALUE(M.substring_matches, '$[0].value') AS AreaCode,
    JSON_VALUE(M.substring_matches, '$[1].value') AS Exchange,
    JSON_VALUE(M.substring_matches, '$[2].value') AS Number
FROM dbo.CustomerData AS C
CROSS APPLY REGEXP_MATCHES(C.PhoneNumber, '(\d{3})[^\d]*(\d{3})[^\d]*(\d{4})') AS M
WHERE C.PhoneNumber IS NOT NULL;
GO
-------------------------------------------------
-- Region: 8. REGEXP_SPLIT_TO_TABLE - Split Strings
-------------------------------------------------
/*
  REGEXP_SPLIT_TO_TABLE turns one string into several rows, cutting it
  wherever the delimiter pattern matches.
*/

-- 8.1 Plain comma-separated list
SELECT
    s.value,
    s.ordinal
FROM REGEXP_SPLIT_TO_TABLE('Apple,Banana,Cherry,Date', ',') AS s;
GO

-- 8.2 Several possible delimiters at once: pipe, semicolon or comma
SELECT
    s.value,
    s.ordinal
FROM REGEXP_SPLIT_TO_TABLE('Apple|Banana;Cherry,Date', '[|;,]') AS s;
GO

-- 8.3 Any run of whitespace acts as a single delimiter
SELECT
    s.value,
    s.ordinal
FROM REGEXP_SPLIT_TO_TABLE('This has multiple spaces', '\s+') AS s;
GO

-- 8.4 Practical example: one row per article tag
CREATE TABLE dbo.Articles
(
    ArticleID INT IDENTITY(1,1) PRIMARY KEY,
    Title     NVARCHAR(200),
    Tags      NVARCHAR(500)   -- comma-separated list of tags
);
GO

INSERT INTO dbo.Articles (Title, Tags)
VALUES
    ('SQL Server 2025 Features',  'sql,database,2025,features'),
    ('Regular Expressions Guide', 'regex,sql,tutorial'),
    ('Performance Tuning Tips',   'performance,optimization,sql,tips');
GO

-- Expand the Tags list so that every (article, tag) pair becomes its own row
SELECT
    art.ArticleID,
    art.Title,
    tag.value AS Tag
FROM dbo.Articles AS art
CROSS APPLY REGEXP_SPLIT_TO_TABLE(art.Tags, ',') AS tag;
GO
-------------------------------------------------
-- Region: 9. Real-World Examples
-------------------------------------------------
/*
  Practical use cases combining multiple regex functions.
*/

-- 9.1 Data Validation and Cleansing
CREATE TABLE dbo.UserInputs
(
    InputID   INT IDENTITY(1,1) PRIMARY KEY,
    UserEmail NVARCHAR(100),
    UserPhone NVARCHAR(50),
    UserZip   NVARCHAR(20)
);
GO

-- Raw input as a user might type it: stray padding, mixed case, mixed
-- phone formats, and one row that is invalid throughout.
INSERT INTO dbo.UserInputs (UserEmail, UserPhone, UserZip)
VALUES
    (' john.doe@example.com ', '(555) 123-4567', '12345'),
    ('JANE.SMITH@COMPANY.NET', '555.987.6543',   '98765-4321'),
    ('invalid-email',          '5551234',        'ABC');
GO

-- Validate and clean user inputs.
-- The email is normalized once (trimmed, lower-cased) and that same cleaned
-- value is what gets validated; validating the raw column would reject an
-- otherwise good address just because of surrounding spaces.
-- REGEXP_LIKE is a predicate, so it is used directly in WHEN (no "= 1").
SELECT
    U.InputID,
    N.CleanedEmail,
    CASE
        WHEN REGEXP_LIKE(N.CleanedEmail, '^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
            THEN 'Valid'
        ELSE 'Invalid'
    END AS EmailValidation,
    REGEXP_REPLACE(U.UserPhone, '[^0-9]', '') AS CleanedPhone,
    CASE
        WHEN REGEXP_LIKE(U.UserZip, '^\d{5}(-\d{4})?$')
            THEN 'Valid'
        ELSE 'Invalid'
    END AS ZipValidation
FROM dbo.UserInputs AS U
CROSS APPLY (SELECT LTRIM(RTRIM(LOWER(U.UserEmail))) AS CleanedEmail) AS N;
GO
-- 9.2 Log Parsing
CREATE TABLE dbo.ServerLogs
(
    LogID    INT IDENTITY(1,1) PRIMARY KEY,
    LogEntry NVARCHAR(500)
);
GO

-- Each entry has the shape: "<date> <time> [<LEVEL>] <free-text message>"
INSERT INTO dbo.ServerLogs (LogEntry)
VALUES
    ('2025-12-25 10:30:45 [ERROR] Connection failed to 192.168.1.100'),
    ('2025-12-25 10:31:12 [INFO] User john.doe@example.com logged in'),
    ('2025-12-25 10:32:00 [WARNING] High memory usage: 85%');
GO

-- Parse log entries.
-- REGEXP_MATCHES returns its capture groups as a JSON array in the
-- substring_matches column (one object per group, in group order), not as
-- separate columns, so each part is pulled out by index:
--   $[0] = timestamp, $[1] = level, $[2] = message.
SELECT
    L.LogID,
    L.LogEntry,
    JSON_VALUE(M.substring_matches, '$[0].value') AS [timestamp],
    JSON_VALUE(M.substring_matches, '$[1].value') AS [level],
    JSON_VALUE(M.substring_matches, '$[2].value') AS [message]
FROM dbo.ServerLogs AS L
CROSS APPLY REGEXP_MATCHES(
    L.LogEntry,
    '(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[([A-Z]+)\] (.*)'
) AS M;
GO
-- 9.3 Extract and validate URLs
CREATE TABLE dbo.WebContent
(
    ContentID INT IDENTITY(1,1) PRIMARY KEY,
    Content   NVARCHAR(1000)
);
GO

INSERT INTO dbo.WebContent (Content)
VALUES
    ('Visit our site at https://www.example.com for more info'),
    ('Check http://blog.example.com and https://docs.example.com/guide'),
    ('No URLs in this text');
GO

-- Count the URLs in each piece of content.
-- A URL here is "http://" or "https://" followed by non-whitespace characters.
SELECT
    ContentID,
    REGEXP_COUNT(Content, 'https?://[^\s]+') AS URLCount,
    Content
FROM dbo.WebContent;
GO

-- Extract all URLs from content: one row per URL found.
-- match_id numbers the matches within a single Content value; rows without
-- any URL produce no match and are therefore absent from this result.
SELECT
    W.ContentID,
    M.match_id    AS URLOrdinal,
    M.match_value AS URL
FROM dbo.WebContent AS W
CROSS APPLY REGEXP_MATCHES(W.Content, 'https?://[^\s]+') AS M;
GO
-------------------------------------------------
-- Region: 10. Performance Considerations
-------------------------------------------------
/*
  Regular expressions can be computationally expensive.
  Consider these best practices:
  - Use computed persisted columns for frequently used patterns
  - Create indexes on computed columns
  - Use simpler patterns when possible (LIKE vs REGEXP_LIKE for simple cases)
*/

-- 10.1 Create computed persisted column for email validation
--      REGEXP_LIKE is a predicate, so it is used directly in WHEN (no "= 1").
--      The column evaluates to 1 for a well-formed address and 0 otherwise
--      (including NULL emails, which fall through to ELSE).
--      NOTE(review): PERSISTED requires the expression to be deterministic;
--      confirm this holds for REGEXP_LIKE on the target build.
ALTER TABLE dbo.CustomerData
ADD IsValidEmail AS
    CASE
        WHEN REGEXP_LIKE(Email, '^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
            THEN 1
        ELSE 0
    END PERSISTED;
GO

-- 10.2 Create index on computed column
--      SQL Server does not allow a filtered-index predicate to reference a
--      computed column, so this is an ordinary (unfiltered) index keyed on
--      the flag. Email is INCLUDEd so the query in 10.3 can be answered from
--      the index alone.
CREATE NONCLUSTERED INDEX IX_CustomerData_ValidEmail
    ON dbo.CustomerData (IsValidEmail)
    INCLUDE (Email);
GO

-- 10.3 Use the indexed column for filtering
SELECT
    CustomerID,
    Email
FROM dbo.CustomerData
WHERE IsValidEmail = 1;
GO
-------------------------------------------------
-- Region: 11. Cleanup
-------------------------------------------------
/*
Optional: Clean up the demo database.
*/
-- USE master;
-- GO
-- DROP DATABASE IF EXISTS RegexDemo;
-- GO