sql-data-cleaning-project/data_cleaning.sql at main · muthazir/sql-data-cleaning-project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
/*
Project: SQL Data Cleaning – Global Layoffs Dataset
Source: Kaggle
https://www.kaggle.com/datasets/swaptr/layoffs-2022
Objective:
- Clean raw layoff data
- Remove duplicates and invalid records
- Standardize fields for analysis

Outcome:
Cleaned dataset ready for EDA and reporting
*/


SELECT *
FROM global_layoffs.layoffs;

-- Creating a staging table to preserve raw data
-- This allows rollback and auditability during cleaning
CREATE TABLE global_layoffs.layoffs_staging
LIKE layoffs;

INSERT INTO layoffs_staging
SELECT *
FROM global_layoffs.layoffs;


-- now when we are data cleaning we usually follow a few steps
-- 1. check for duplicates and remove any
-- 2. standardize data and fix errors
-- 3. Look at null values and see what
-- 4. remove any columns and rows that are not necessary - few ways


-- 1. Remove Duplicates


-- Identifying duplicate records based on business-relevant fields
-- These columns uniquely represent a layoff event
-- ROW_NUMBER is used to retain the earliest occurrence

WITH duplicate_cte AS
(
SELECT * ,
ROW_NUMBER() OVER(
PARTITION BY company,industry,total_laid_off,percentage_laid_off,`date`,stage,funds_raised_millions,country) AS row_num
FROM layoffs_staging
)
SELECT * FROM
duplicate_cte
WHERE row_num > 1;

-- these are the ones we want to delete where the row number is > 1 or 2 or greater essentially

-- creating layoffs_staging2 same as layoffs_staging1 with an extra 'row_num' column to help delete duplicate rows (keep only 1).

CREATE TABLE `layoffs_staging2` (
  `company` text,
  `location` text,
  `industry` text,
  `total_laid_off` text,
  `percentage_laid_off` text,
  `date` text,
  `stage` text,
  `country` text,
  `funds_raised_millions` text,
  `row_num` int
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

INSERT INTO layoffs_staging2
SELECT * ,
ROW_NUMBER() OVER(
PARTITION BY company,industry,total_laid_off,percentage_laid_off,`date`,stage,funds_raised_millions,country) AS row_num
FROM layoffs_staging;

-- Verifying duplicate count before deletion
SELECT COUNT(*) AS duplicate_rows
FROM layoffs_staging2
WHERE row_num > 1;

-- now that we have this we can delete rows where row_num > 1 (keeping one valid record)

DELETE
FROM layoffs_staging2
WHERE row_num > 1;

-- Confirming duplicates are removed
SELECT COUNT(*) AS remaining_duplicates
FROM layoffs_staging2
WHERE row_num > 1;


-- 2. Standardizing data


-- removing leading and trailing spaces from company names to ensure consistency.

SELECT company,TRIM(company)
FROM layoffs_staging2;

UPDATE layoffs_staging2
SET company = TRIM(company);


-- if we look at industry it looks like we have some null and empty rows, let's take a look at these
SELECT DISTINCT industry
FROM global_layoffs.layoffs_staging2
ORDER BY industry;

SELECT *
FROM global_layoffs.layoffs_staging2
WHERE industry IS NULL
OR industry = ''
ORDER BY industry;

-- this query is fine; no issues here.
SELECT *
FROM global_layoffs.layoffs_staging2
WHERE company LIKE 'Bally%';
-- this query is fine; no issues here.

SELECT *
FROM global_layoffs.layoffs_staging2
WHERE company LIKE 'airbnb%';

-- it looks like airbnb is a travel, but this one just isn't populated.
-- I'm sure it's the same for the others. What we can do is
-- write a query that if there is another row with the same company name, it will update it to the non-null industry values
-- makes it easy so if there were thousands we wouldn't have to manually check them all

-- we should set the blanks to nulls since those are typically easier to work with
UPDATE global_layoffs.layoffs_staging2
SET industry = NULL
WHERE industry = '';

-- now if we check those are all null

SELECT *
FROM global_layoffs.layoffs_staging2
WHERE industry IS NULL
OR industry = ''
ORDER BY industry;

-- now we need to populate those nulls if possible

UPDATE layoffs_staging2 t1
JOIN layoffs_staging2 t2
ON t1.company = t2.company
SET t1.industry = t2.industry
WHERE t1.industry IS NULL
AND t2.industry IS NOT NULL;

-- and if we check it looks like Bally's was the only one without a populated row to populate this null values
SELECT *
FROM global_layoffs.layoffs_staging2
WHERE industry IS NULL
OR industry = ''
ORDER BY industry;


-- Crypto has multiple different variations. We need to standardize that - let's set all to Crypto
SELECT DISTINCT industry
FROM global_layoffs.layoffs_staging2
ORDER BY industry;

UPDATE layoffs_staging2
SET industry = 'Crypto'
WHERE industry IN ('Crypto Currency', 'CryptoCurrency');

-- now that's taken care of:
SELECT DISTINCT industry
FROM global_layoffs.layoffs_staging2
ORDER BY industry;

-- --------------------------------------------------
-- we also need to look at

SELECT *
FROM global_layoffs.layoffs_staging2;

-- everything looks good except apparently we have some "United States" and some "United States." with a period at the end. Let's standardize this.
SELECT DISTINCT country
FROM global_layoffs.layoffs_staging2
ORDER BY country;

UPDATE layoffs_staging2
SET country = TRIM(TRAILING '.' FROM country);

-- now if we run this again it is fixed
SELECT DISTINCT country
FROM global_layoffs.layoffs_staging2
ORDER BY country;


-- Let's also fix the date columns:
SELECT *
FROM global_layoffs.layoffs_staging2;

-- we can use str to date to update this field
UPDATE layoffs_staging2
SET `date` = STR_TO_DATE(`date`, '%m/%d/%Y');

-- now we can convert the data type properly
ALTER TABLE layoffs_staging2
MODIFY COLUMN `date` DATE;


SELECT *
FROM global_layoffs_layoffs.layoffs_staging2;


-- 3. Look at Null Values

-- NULLs in total_laid_off, percentage_laid_off, and funds_raised_millions seem appropriate.
-- keeping them as NULL is useful for accurate calculations during the EDA phase.

-- no changes needed for NULL values at this point.


-- 4. remove any columns and rows we need to

SELECT *
FROM global_layoffs.layoffs_staging2
WHERE total_laid_off IS NULL;


SELECT *
FROM global_layoffs.layoffs_staging2
WHERE total_laid_off IS NULL
AND percentage_laid_off IS NULL;

-- Delete Useless data we can't really use
DELETE FROM global_layoffs.layoffs_staging2
WHERE total_laid_off IS NULL
AND percentage_laid_off IS NULL;

SELECT *
FROM global_layoffs.layoffs_staging2;

ALTER TABLE layoffs_staging2
DROP COLUMN row_num;

-- Final cleaned dataset ready for analysis
-- All duplicates removed, formats standardized, and invalid records excluded
SELECT *
FROM global_layoffs.layoffs_staging2;


-- High-level dataset summary
SELECT
  COUNT(*) AS total_records,
  COUNT(DISTINCT company) AS unique_companies
FROM layoffs_staging2;