## Useful queries

```sql
-- Check currently running queries
SELECT req.session_id
     , req.total_elapsed_time AS duration_ms
     , req.total_elapsed_time / (1000 * 60) AS 'duration(min)'
     , req.cpu_time AS cpu_time_ms
     , req.cpu_time / (1000 * 60) AS 'cpu_time(min)'
     , req.wait_time AS current_wait_time
     , req.wait_time / (1000 * 60) AS 'current_wait_time(min)'
     , req.total_elapsed_time - req.cpu_time AS elapsed_minus_cpu_time
     , req.logical_reads
     , SUBSTRING(REPLACE(REPLACE(SUBSTRING(ST.text, (req.statement_start_offset / 2) + 1,
           ((CASE statement_end_offset
                 WHEN -1 THEN DATALENGTH(ST.text)
                 ELSE req.statement_end_offset
             END - req.statement_start_offset) / 2) + 1)
       , CHAR(10), ' '), CHAR(13), ' '), 1, 512) AS statement_text
FROM sys.dm_exec_requests AS req
CROSS APPLY sys.dm_exec_sql_text(req.sql_handle) AS ST
ORDER BY total_elapsed_time DESC;

-- Overall runtime stats check
SELECT qsq.query_id, qsq.query_hash, qst.query_sql_text,
       qrs.execution_type, qrs.execution_type_desc, qpx.query_plan_xml,
       qrs.count_executions, qrs.last_execution_time
FROM sys.query_store_query AS qsq
JOIN sys.query_store_plan AS qsp ON qsq.query_id = qsp.query_id
JOIN sys.query_store_query_text AS qst ON qsq.query_text_id = qst.query_text_id
OUTER APPLY (SELECT TRY_CONVERT(XML, qsp.query_plan) AS query_plan_xml) AS qpx
JOIN sys.query_store_runtime_stats qrs ON qsp.plan_id = qrs.plan_id
ORDER BY qrs.last_execution_time DESC;
GO

-- Get query ID/hash from a statement keyword
SELECT qsq.query_id, qsq.query_hash, qsq.last_execution_time, qsqt.query_sql_text
FROM sys.query_store_query qsq
INNER JOIN sys.query_store_query_text qsqt ON qsq.query_text_id = qsqt.query_text_id
WHERE qsqt.query_sql_text LIKE '%SELECT%'; -- modify

-- Get query text from query ID
SELECT q.query_id, qt.query_sql_text, q.query_hash
FROM sys.query_store_query_text AS qt
JOIN sys.query_store_query AS q ON qt.query_text_id = q.query_text_id
WHERE q.query_id = -- modify

-- Check a specific query
SELECT Txt.query_text_id, Txt.query_sql_text, Run.*
FROM sys.query_store_plan AS Pl
INNER JOIN sys.query_store_query AS Qry ON Pl.query_id = Qry.query_id
INNER JOIN sys.query_store_query_text AS Txt ON Qry.query_text_id = Txt.query_text_id
INNER JOIN sys.query_store_runtime_stats AS Run ON Run.plan_id = Pl.plan_id
WHERE Qry.query_id = -- modify

-- Check execution counts per hour
SELECT YEAR(start_time) AS 'year', MONTH(start_time) AS 'month', DAY(start_time) AS 'day',
       DATEPART(hour, start_time) AS 'hour',
       SUM(rs.count_executions) AS 'execution_count',
       SUM(CASE WHEN rs.execution_type_desc = 'Regular' THEN count_executions ELSE 0 END) AS 'regular_execution_count',
       SUM(CASE WHEN rs.execution_type_desc = 'Aborted' THEN count_executions ELSE 0 END) AS 'aborted_execution_count',
       SUM(CASE WHEN rs.execution_type_desc = 'Exception' THEN count_executions ELSE 0 END) AS 'exception_execution_count',
       ROUND(CONVERT(float, SUM(rs.avg_cpu_time * rs.count_executions)) / NULLIF(SUM(rs.count_executions), 0) * 0.001, 0) AS 'avg_cpu_time',
       ROUND(CONVERT(float, SUM(rs.avg_duration * rs.count_executions)) / NULLIF(SUM(rs.count_executions), 0) * 0.001, 0) AS 'avg_duration',
       ROUND(CONVERT(float, SUM(rs.avg_logical_io_reads * rs.count_executions)) / NULLIF(SUM(rs.count_executions), 0) * 8, 0) AS 'avg_logical_io_reads'
FROM sys.query_store_runtime_stats rs (NOLOCK)
INNER JOIN sys.query_store_runtime_stats_interval i ON rs.runtime_stats_interval_id = i.runtime_stats_interval_id
JOIN sys.query_store_plan AS qp ON rs.plan_id = qp.plan_id
JOIN sys.query_store_query AS qq ON qp.query_id = qq.query_id
-- WHERE qq.query_id = 1234
-- Limit to a time window as appropriate:
-- WHERE start_time >= '2023-01-15 09:00:00' AND start_time <= '2023-01-18 18:00:00'
GROUP BY YEAR(start_time), MONTH(start_time), DAY(start_time), DATEPART(hour, start_time)
ORDER BY YEAR(start_time), MONTH(start_time), DAY(start_time), DATEPART(hour, start_time);

-- Check stored procedure stats
-- https://learn.microsoft.com/en-us/sql/relational-databases/in-memory-oltp/monitoring-performance-of-natively-compiled-stored-procedures?view=sql-server-ver17#sample-queries
SELECT object_id, OBJECT_NAME(object_id) AS 'object name', cached_time, last_execution_time,
       execution_count, total_worker_time, last_worker_time, min_worker_time, max_worker_time,
       total_elapsed_time, last_elapsed_time, min_elapsed_time, max_elapsed_time
FROM sys.dm_exec_procedure_stats
WHERE database_id = DB_ID();

-- Check timed-out (aborted) queries
SELECT qst.query_sql_text, qrs.execution_type, qrs.execution_type_desc, qpx.query_plan_xml,
       qrs.count_executions, qrs.last_execution_time
FROM sys.query_store_query AS qsq
JOIN sys.query_store_plan AS qsp ON qsp.query_id = qsq.query_id
JOIN sys.query_store_query_text AS qst ON qst.query_text_id = qsq.query_text_id
OUTER APPLY (SELECT query_plan_xml = TRY_CONVERT(XML, qsp.query_plan)) AS qpx
JOIN sys.query_store_runtime_stats AS qrs ON qrs.plan_id = qsp.plan_id
WHERE qrs.execution_type = 3
ORDER BY qrs.last_execution_time DESC;
GO
```

## Useful links

General guide
- [Troubleshoot slow-running queries - SQL Server | Microsoft Learn](https://learn.microsoft.com/en-us/troubleshoot/sql/database-engine/performance/troubleshoot-slow-running-queries#introduction)
- [Troubleshoot performance issues with Intelligent Insights - Azure SQL Database & Azure SQL Managed Instance | Microsoft Learn](https://learn.microsoft.com/en-us/azure/azure-sql/database/intelligent-insights-troubleshoot-performance?view=azuresql)
- [Performance Tuning Guidance for Applications and Databases - Azure SQL Database | Microsoft Learn](https://learn.microsoft.com/en-us/azure/azure-sql/database/performance-guidance?view=azuresql)

Query Store
- [Use the Regressed Queries feature](https://learn.microsoft.com/en-us/sql/relational-databases/performance/monitoring-performance-by-using-the-query-store?view=sql-server-ver17#use-the-regressed-queries-feature)

Identify bottlenecks:
- [Detectable types of query performance bottlenecks - Azure SQL Database | Microsoft Learn](https://learn.microsoft.com/en-us/azure/azure-sql/database/identify-query-performance-issues?view=azuresql&source=recommendations)
- High CPU: [Diagnose and Troubleshoot High CPU - Azure SQL Database | Microsoft Learn](https://learn.microsoft.com/en-us/azure/azure-sql/database/high-cpu-diagnose-troubleshoot?view=azuresql)

### CPU bound

[Troubleshoot high-CPU-usage issues in SQL Server - SQL Server | Microsoft Learn](https://learn.microsoft.com/en-us/troubleshoot/sql/database-engine/performance/troubleshoot-high-cpu-usage-issues)

Although high CPU usage in SQL Server can have many causes, the following are the most common:
- High logical reads caused by table or index scans, resulting from:
  - Out-of-date statistics
  - Missing indexes
  - Parameter sensitive plan (PSP) issues
  - Poorly designed queries
- An increase in workload

#### About MAXDOP

[Thread and task architecture guide - SQL Server | Microsoft Learn](https://learn.microsoft.com/en-us/sql/relational-databases/thread-and-task-architecture-guide?view=sql-server-ver17#workers)

The max degree of parallelism (MAXDOP) limit is set per task, not per request. This means that during a parallel query execution, a single request can spawn multiple tasks up to the MAXDOP limit, and each task uses one worker. The number of worker threads spawned for each task depends on:
- Whether the request was eligible for parallelism, as determined by the Query Optimizer.
- The actual available degree of parallelism (DOP) in the system, based on current load. This may differ from the estimated DOP, which is based on the server configuration for max degree of parallelism (MAXDOP). For example, the server MAXDOP configuration may be 8, while the available DOP at runtime is only 2, which affects query performance.
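Where parallelism itself turns out to be the problem, MAXDOP can be capped at several scopes. A minimal sketch (the table name `dbo.Sales` is hypothetical):

```sql
-- Per statement: the hint overrides database/server settings for this query only.
SELECT COUNT(*) FROM dbo.Sales OPTION (MAXDOP 2);

-- Per database (SQL Server 2016+ and Azure SQL Database):
ALTER DATABASE SCOPED CONFIGURATION SET MAXDOP = 4;

-- Server-wide (SQL Server):
EXEC sp_configure 'show advanced options', 1; RECONFIGURE;
EXEC sp_configure 'max degree of parallelism', 4; RECONFIGURE;
```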
Memory pressure and lack of workers are two conditions that reduce available DOP at runtime.

```sql
-- Check DOP values in Query Store
SELECT q.query_id, qt.query_sql_text, qs.count_executions, qs.avg_dop, qs.max_dop
FROM sys.query_store_query q
JOIN sys.query_store_query_text qt ON q.query_text_id = qt.query_text_id
JOIN sys.query_store_plan qsp ON q.query_id = qsp.query_id
JOIN sys.query_store_runtime_stats qs ON qsp.plan_id = qs.plan_id
-- WHERE q.query_id =
ORDER BY qs.max_dop DESC;

-- Check DOP values in DMVs
SELECT qs.execution_count,
       qs.total_worker_time / qs.execution_count AS avg_worker_time,
       qs.max_dop, -- highest DOP this plan has used
       st.text AS sql_text
FROM sys.dm_exec_query_stats qs
CROSS APPLY sys.dm_exec_sql_text(qs.sql_handle) st
ORDER BY qs.total_worker_time DESC;
```

### I/O Bound

Data IO happens when the database engine must fetch data from disk. High disk reads can result from cache eviction, which often occurs after a database restart or under memory pressure from heavy workloads, or from query-related issues such as plan regression.

### Plan analysis: optimizer timeouts

I recently handled an interesting query performance issue. The query performed well when retrieving several months of data, but when the parameter was set to retrieve only one month, performance degraded significantly even though less data was involved. We found it was using different plans. The operations were quite similar, and the time spent on each operation in the bad plan was higher across the board than in the good plan, with no apparent bottleneck in any single operation. The most suspicious point was that "Reason for Early Termination of Statement Optimization" was marked "Timed Out" in the bad plan. Both plans carried warnings for type conversion and a missing index.
This blog shares insights on optimizer timeouts: [Understanding Optimizer Timeout and how Complex queries can be Affected in SQL Server | Microsoft Community Hub](https://techcommunity.microsoft.com/blog/sqlserversupport/understanding-optimizer-timeout-and-how-complex-queries-can-be-affected-in-sql-s/319188)

The query itself is very complex and joins many tables. As the blog notes, **with just 10 tables to join, the possible join-order permutations run into the millions.** A query with many joins is therefore more likely to hit the optimizer timeout threshold than one with fewer joins. I recommended the following quick mitigations and long-run improvements.

Quick mitigations:
- Update statistics: [UPDATE STATISTICS (Transact-SQL) - SQL Server | Microsoft Learn](https://learn.microsoft.com/en-us/sql/t-sql/statements/update-statistics-transact-sql?view=sql-server-ver17#sample-number--percent--rows%2D)
  > For most workloads, a full scan isn't required, and default sampling is adequate. However, certain workloads that are sensitive to widely varying data distributions might require an increased sample size, or even a full scan.

  `UPDATE STATISTICS Tablename WITH FULLSCAN;`
- Add the query hint `OPTION (RECOMPILE)` to recompile the query and see whether it improves.
- Force a known-good plan with [sp_query_store_force_plan (Transact-SQL) - SQL Server | Microsoft Learn](https://learn.microsoft.com/en-us/sql/relational-databases/system-stored-procedures/sp-query-store-force-plan-transact-sql?view=sql-server-ver17).

Improvements for the long run:
- Review the type conversion and missing index warnings and determine whether corrective actions are needed.
- Rewrite the query to reduce complexity and the number of joins.
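Other statements affected by optimizer timeouts can be found by searching plan XML for the early-abort attribute. A sketch against the plan cache (the `LIKE` scan over plan XML is itself expensive, so keep `TOP` small and run off-hours):

```sql
-- Cached plans whose optimization ended early with "Timed Out".
-- StatementOptmEarlyAbortReason="TimeOut" is the showplan attribute behind
-- "Reason for Early Termination of Statement Optimization".
SELECT TOP (20) cp.usecounts, cp.objtype, qp.query_plan
FROM sys.dm_exec_cached_plans AS cp
CROSS APPLY sys.dm_exec_query_plan(cp.plan_handle) AS qp
WHERE CAST(qp.query_plan AS nvarchar(max)) LIKE N'%StatementOptmEarlyAbortReason="TimeOut"%';
```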
### Timeouts

Useful links
- [Troubleshoot query time-out errors](https://learn.microsoft.com/en-us/troubleshoot/sql/database-engine/performance/troubleshoot-query-timeouts)
- [QTip: Getting timeouts and exceptions using Azure SQL DB Query Store](https://techcommunity.microsoft.com/blog/azuredbsupport/qtip-getting-timeouts-and-exceptions-using-azure-sql-db-query-store/4427310)

```sql
DECLARE @datestart datetime = DATEADD(D, -2, GETDATE());
DECLARE @datefinish datetime = GETDATE();
/* To set a specific window instead: */
--SET @datestart = '2025-04-09 00:00:00';
--SET @datefinish = '2025-04-09 23:59:59';
SELECT rs.last_execution_time, rs.execution_type_desc, qt.query_sql_text, q.query_id,
       CONVERT(VARCHAR(1000), q.query_hash, 1) AS strqueryhash,
       p.plan_id, rs.last_cpu_time, rs.last_duration, rs.count_executions,
       rs.last_rowcount, rs.last_logical_io_reads, rs.last_physical_io_reads,
       rs.last_query_max_used_memory, rs.last_tempdb_space_used, rs.last_dop,
       p.is_forced_plan, p.last_force_failure_reason, p.last_force_failure_reason_desc
FROM sys.query_store_query_text AS qt
JOIN sys.query_store_query AS q ON qt.query_text_id = q.query_text_id
JOIN sys.query_store_plan AS p ON q.query_id = p.query_id
JOIN sys.query_store_runtime_stats AS rs ON p.plan_id = rs.plan_id
WHERE rs.last_execution_time >= @datestart
  AND rs.last_execution_time <= @datefinish
  AND (rs.execution_type = 3 OR rs.execution_type = 4) -- 3 = timeout, 4 = error
--AND qt.query_sql_text LIKE '%actual%'
--AND q.query_hash = 0x009C458D20394C37
--AND p.plan_id = 12
ORDER BY rs.last_execution_time DESC
```

### Compilation timeouts

If a query or command shows as timed out in the audit logs but there is no matching record in the DMVs or Query Store, it could be a compilation timeout. When an application reaches its configured command timeout during the compilation phase, the query never executes and therefore produces no runtime statistics.
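For statements that do eventually compile, Query Store keeps per-query compile statistics, and unusually long compile durations support this theory. A sketch (durations in `sys.query_store_query` are in microseconds):

```sql
-- Queries with the longest average compile times recorded in Query Store.
SELECT TOP (20) q.query_id,
       q.count_compiles,
       q.avg_compile_duration / 1000.0 AS avg_compile_ms,
       q.last_compile_duration / 1000.0 AS last_compile_ms,
       qt.query_sql_text
FROM sys.query_store_query AS q
JOIN sys.query_store_query_text AS qt ON q.query_text_id = qt.query_text_id
ORDER BY q.avg_compile_duration DESC;
```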
Query Store and DMVs primarily capture runtime execution data, so they may not record the timed-out attempt.

A known long-compilation case: [Improving Performance with AUTO_UPDATE_STATISTICS_ASYNC](https://techcommunity.microsoft.com/blog/azuredbsupport/improving-performance-with-auto-update-statistics-async/3579907)

### Plan regression due to statistics changes

We hit a case where the same query hash was compiled into multiple plan hashes, with large swings in logical reads and elapsed time. When statistics change and the query is recompiled, the query optimizer uses the statistics from that point onward. However, the changed cardinality estimates can lead it to choose a suboptimal execution plan.

The commonly used countermeasures are summarized below. See also the official documentation: [Detectable types of query performance bottlenecks - Azure SQL Database | Microsoft Learn](https://learn.microsoft.com/ja-jp/azure/azure-sql/database/identify-query-performance-issues?view=azuresql#compilation-problems-resulting-in-a-suboptimal-query-plan)

- Maintain statistics and indexes
  Re-update statistics on the main tables to make cardinality estimates accurate:
  `UPDATE STATISTICS <table_name> WITH FULLSCAN;`
  Check index fragmentation and rebuild as needed: [Optimize index maintenance to improve query performance and reduce resource consumption - SQL Server | Microsoft Learn](https://learn.microsoft.com/ja-jp/sql/relational-databases/indexes/reorganize-and-rebuild-indexes?view=sql-server-ver17)
- Force the best plan
  Use Query Store to identify the historically best plan and force it to stabilize performance. The steps below are a reference:

```sql
-- Get the query ID and plan hashes for a given query hash
SELECT q.query_id, p.plan_id, p.query_plan_hash,
       SUM(rs.count_executions * rs.avg_duration) / 1000.0 AS total_duration_ms,
       SUM(rs.count_executions * rs.avg_cpu_time) / 1000.0 AS total_cpu_ms
FROM sys.query_store_query AS q
JOIN sys.query_store_plan AS p ON p.query_id = q.query_id
JOIN sys.query_store_runtime_stats AS rs ON rs.plan_id = p.plan_id
WHERE q.query_hash = xxxxx
GROUP BY q.query_id, p.plan_id, p.query_plan_hash
ORDER BY total_duration_ms;

-- Force the best plan
EXEC sp_query_store_force_plan @query_id = , @plan_id = ;
```

- Identify missing indexes
  [Tune nonclustered indexes with missing index suggestions - SQL Server | Microsoft Learn](https://learn.microsoft.com/ja-jp/sql/relational-databases/indexes/tune-nonclustered-missing-index-suggestions?view=sql-server-ver17#view-missing-index-recommendations)
- Optimize query statements
  The link below describes the basic steps in terms of SELECT statement processing, but they also apply to other Transact-SQL statements such as INSERT, UPDATE, and DELETE.
  [Query processing architecture guide - SQL Server | Microsoft Learn](https://learn.microsoft.com/en-us/sql/relational-databases/query-processing-architecture-guide?view=sql-server-ver17#optimizing-select-statements)

## Index Maintenance

[Maintaining Indexes Optimally to Improve Performance and Reduce Resource Utilization - SQL Server | Microsoft Learn](https://learn.microsoft.com/en-us/sql/relational-databases/indexes/reorganize-and-rebuild-indexes?view=azuresqldb-current&preserve-view=true#index-maintenance-strategy)

Strategy:
- Rebuild only large, highly fragmented, frequently used indexes.
- Reorganize indexes with medium fragmentation. Reorganize is online, lighter, and often sufficient post-shrink.

Thresholds: fragmentation < 5%: ignore; 5-30%: REORGANIZE; >= 30%: REBUILD.

Check index fragmentation:

```sql
SELECT OBJECT_SCHEMA_NAME(ips.object_id) AS schema_name,
       OBJECT_NAME(ips.object_id) AS object_name,
       i.name AS index_name,
       o.name AS table_name,
       i.type_desc AS index_type,
       ips.avg_page_space_used_in_percent,
       ips.avg_fragmentation_in_percent,
       ips.page_count,
       ips.alloc_unit_type_desc,
       ips.ghost_record_count
FROM sys.dm_db_index_physical_stats(DB_ID(), default, default, default, 'SAMPLED') AS ips
INNER JOIN sys.indexes AS i ON ips.object_id = i.object_id AND ips.index_id = i.index_id
JOIN sys.objects o ON i.object_id = o.object_id
WHERE ips.page_count >= 1000 -- fragmentation on small indexes is usually irrelevant
  AND ips.avg_fragmentation_in_percent >= 30 -- below this, rebuild benefit is marginal
ORDER BY page_count DESC;
```

Check whether an index is frequently used. Note that sys.dm_db_index_usage_stats is reset after a restart.

```sql
-- sys.dm_db_index_usage_stats - verify whether an index has been used since the last restart.
-- user_seeks   → highly valuable (point/range lookups)
-- user_scans   → sequential reads (for reporting/ETL)
-- user_lookups → key lookups from nonclustered indexes
-- Indexes with only user_updates and no reads are being maintained but not used.
SELECT s.name AS schema_name, o.name AS table_name, i.name AS index_name,
       u.user_seeks, u.user_scans, u.user_lookups, u.user_updates,
       (u.user_seeks + u.user_scans + u.user_lookups) AS total_reads
FROM sys.dm_db_index_usage_stats u
JOIN sys.indexes i ON u.object_id = i.object_id AND u.index_id = i.index_id
JOIN sys.objects o ON i.object_id = o.object_id
JOIN sys.schemas s ON o.schema_id = s.schema_id
WHERE u.database_id = DB_ID()
ORDER BY total_reads DESC;

-- Check when the stats were last reset
SELECT sqlserver_start_time FROM sys.dm_os_sys_info;
```

```sql
-- 1) Index REORGANIZE (preferred for medium fragmentation)
ALTER INDEX IX_Sales_OrderDate ON dbo.Sales REORGANIZE;
ALTER INDEX ALL ON dbo.Sales REORGANIZE;

-- 2) Index REBUILD (for high fragmentation)
-- ONLINE = ON         → avoids long blocking
-- SORT_IN_TEMPDB = ON → faster if tempdb has space
-- DATA_COMPRESSION    → optional, include only if already standardized
ALTER INDEX IX_Sales_OrderDate ON dbo.Sales
REBUILD WITH (ONLINE = ON, SORT_IN_TEMPDB = ON, DATA_COMPRESSION = PAGE);
ALTER INDEX ALL ON dbo.Sales REBUILD WITH (ONLINE = ON);
```

- Dynamic best-practice script (chooses REBUILD vs REORGANIZE automatically)

```sql
SET NOCOUNT ON;
DECLARE @MinPageCount int = 1000;        -- adjust: ignore small indexes
DECLARE @ReorgFrom float = 5.0;          -- >= 5%  -> reorganize
DECLARE @RebuildFrom float = 30.0;       -- >= 30% -> rebuild
DECLARE @mode nvarchar(10) = N'SAMPLED'; -- SAMPLED/LIMITED/DETAILED

-- Collect candidate indexes + fragmentation
IF OBJECT_ID('tempdb..#idx') IS NOT NULL DROP TABLE #idx;
SELECT s.name AS schema_name,
       o.name AS table_name,
       i.index_id,
       i.name AS index_name, -- NULL for heaps (index_id = 0)
       ips.avg_fragmentation_in_percent AS frag_pct,
       ips.page_count,
       ips.index_type_desc,
       ips.alloc_unit_type_desc
INTO #idx
FROM sys.dm_db_index_physical_stats(DB_ID(), NULL, NULL, NULL, @mode) ips
JOIN sys.objects o ON ips.object_id = o.object_id
JOIN sys.schemas s ON o.schema_id = s.schema_id
LEFT JOIN sys.indexes i ON ips.object_id = i.object_id AND ips.index_id = i.index_id
WHERE o.type = 'U'
  AND ips.index_level = 0                      -- leaf level only
  AND ips.alloc_unit_type_desc = 'IN_ROW_DATA' -- optional: focus on rowstore in-row
  AND ips.page_count >= @MinPageCount;

-- Build commands (NULL-safe)
DECLARE @sql nvarchar(max) = N'';
;WITH cmd AS (
    SELECT schema_name, table_name, index_id, index_name, frag_pct, page_count, index_type_desc,
           Command = CASE
               WHEN index_id = 0 THEN
                   N'-- HEAP (no index): ' + QUOTENAME(schema_name) + N'.' + QUOTENAME(table_name)
               WHEN frag_pct >= @RebuildFrom THEN
                   N'ALTER INDEX ' + QUOTENAME(index_name) + N' ON '
                   + QUOTENAME(schema_name) + N'.' + QUOTENAME(table_name)
                   + N' REBUILD WITH (ONLINE = ON);'
               WHEN frag_pct >= @ReorgFrom THEN
                   N'ALTER INDEX ' + QUOTENAME(index_name) + N' ON '
                   + QUOTENAME(schema_name) + N'.' + QUOTENAME(table_name)
                   + N' REORGANIZE;'
               ELSE NULL
           END
    FROM #idx
)
SELECT @sql = @sql + COALESCE(Command, N'') + CHAR(13) + CHAR(10)
FROM cmd
WHERE Command IS NOT NULL
  AND Command NOT LIKE N'-- HEAP%'; -- only include actual ALTER INDEX commands
-- @sql now holds the generated commands; review them before executing.

------------------------------------------------------------
-- 1) Report: indexes that got a generated command
------------------------------------------------------------
;WITH cmd AS (
    SELECT schema_name, table_name, index_id, index_name, frag_pct, page_count, index_type_desc,
           GeneratedCommand = CASE
               WHEN index_id = 0 THEN NULL
               WHEN frag_pct >= @RebuildFrom THEN
                   N'ALTER INDEX ' + QUOTENAME(index_name) + N' ON '
                   + QUOTENAME(schema_name) + N'.' + QUOTENAME(table_name)
                   + N' REBUILD WITH (ONLINE = ON);'
               WHEN frag_pct >= @ReorgFrom THEN
                   N'ALTER INDEX ' + QUOTENAME(index_name) + N' ON '
                   + QUOTENAME(schema_name) + N'.'
                   + QUOTENAME(table_name) + N' REORGANIZE;'
               ELSE NULL
           END
    FROM #idx
)
SELECT schema_name, table_name, index_name, index_type_desc, frag_pct, page_count, GeneratedCommand
FROM cmd
WHERE GeneratedCommand IS NOT NULL
ORDER BY frag_pct DESC, page_count DESC;
------------------------------------------------------------
```

- Sample report

| Schema Name | Table Name | Index Name | Index Type | Fragmentation (%) | Page Count | Generated Command |
|-------------|------------|------------|------------|-------------------|------------|-------------------|
| dbo | LargeTable | PK__LargeTab__3214EC27B748AD4C | CLUSTERED INDEX | 50.02 | 7665 | `ALTER INDEX [PK__LargeTab__3214EC27B748AD4C] ON [dbo].[LargeTable] REBUILD WITH (ONLINE = ON);` |

## Shrink database

The allocated data space increases automatically but does not decrease after data is deleted. As a result, although the used data space is reduced, the allocated data space may remain high.

Reference: https://learn.microsoft.com/en-us/azure/azure-sql/database/file-space-manage?view=azuresql-db#understanding-types-of-storage-space-for-a-database
- Reclaim unused allocated space: https://learn.microsoft.com/en-us/azure/azure-sql/database/file-space-manage?view=azuresql-db#reclaim-unused-allocated-space
- Best practices to shrink large databases: https://learn.microsoft.com/en-us/azure/azure-sql/database/file-space-manage?view=azuresql-db#shrink-large-databases

### Considerations

- The shrink process should ideally be run at a quiet time of day.
- A database shrink operation that reclaims unused allocated space is a CPU- and IO-intensive process that can take significant time to complete. If the space involved is hundreds of gigabytes, it can run for many hours or even days. However, the shrink command can be stopped at any point, and any completed work is preserved.
- If CPU, Data IO, or Log IO utilization is at (or near) 100%, scale the database up to obtain more CPU cores and increased IO throughput. The service objective of the database should also be considered: the Business Critical and Premium service tiers perform better than General Purpose and Standard because they use local SSD storage for the data files and provide higher IOPS, higher throughput, and lower I/O latency. Consider a Business Critical or Premium service tier if shrinking is taking longer than desired.
- After a shrink operation completes against data files, indexes might become fragmented. If performance degrades after the shrink completes, consider index maintenance to rebuild the indexes.
- The WAIT_AT_LOW_PRIORITY option can make shrink wait at low priority and avoid affecting customer workloads.
  https://learn.microsoft.com/en-us/sql/relational-databases/databases/shrink-a-database?view=sql-server-ver16#remarks
  https://learn.microsoft.com/en-us/sql/t-sql/database-console-commands/dbcc-shrinkfile-transact-sql?view=sql-server-ver16#wait_at_low_priority
  When a shrink command is executed in WAIT_AT_LOW_PRIORITY mode, new queries requiring schema stability (Sch-S) locks are not blocked by the waiting shrink operation until it stops waiting and starts executing. The shrink operation executes once it can obtain a schema modify (Sch-M) lock. If a shrink operation in WAIT_AT_LOW_PRIORITY mode cannot obtain the lock because of a long-running query, it times out after 1 minute by default and exits with no error.

```sql
DBCC SHRINKFILE ('data_0') WITH WAIT_AT_LOW_PRIORITY (ABORT_AFTER_WAIT = SELF);
```

- There is no guarantee on the completion time. It's highly recommended to simulate the shrink operation to estimate the time required before performing it in the production environment.
  Typically, you can restore the database from backup files to replicate the same space conditions without impacting the original database.

### Best practice

Best practices to shrink large databases: https://learn.microsoft.com/en-us/azure/azure-sql/database/file-space-manage?view=azuresql-db#shrink-large-databases

- Capture the current usage of each database file and revisit it after every operation.
- If there are indexes with a high page count whose **page density** is lower than 60-70%, consider rebuilding or reorganizing those indexes before shrinking data files. [Link to section](https://learn.microsoft.com/en-us/azure/azure-sql/database/file-space-manage?view=azuresql-db#evaluate-index-page-density).
  > Note that this guidance is about page density, not logical fragmentation. Fragmentation = pages out of logical order (impacts I/O patterns); page density (page fullness) = how full pages are (impacts space usage). Low page density means lots of half-empty pages, so the index consumes more space than necessary and shrink has little free space inside objects to reclaim. Rebuilding increases page density by compacting pages.

Incremental shrink:

- First execute shrink for each data file with the TRUNCATEONLY parameter. If there is any allocated but unused space at the end of the file, it is removed quickly and without any data movement. The following sample command truncates the data file with file_id 4: `DBCC SHRINKFILE (4, TRUNCATEONLY);`
- If the data file size after the TRUNCATEONLY shrink remains larger than wanted, shrink the file to a target size (e.g. 52,000 MB): `DBCC SHRINKFILE (4, 52000);`

If a workload is running concurrently with shrink, it might start using the storage space freed by shrink before shrink completes and truncates the file. In that case, shrink will not be able to reduce allocated space to the specified target.
- To mitigate this, set a target slightly smaller than the current allocated space. For example, if the allocated space for a file is 200,000 MB and you want to shrink it to 100,000 MB, first set the target to 170,000 MB. Once this command completes, it has truncated the file and reduced its allocated size to 170,000 MB. Repeat the command, setting the target first to 140,000 MB, then to 110,000 MB, and so on, until the file is shrunk to the desired size.
- Shrink with data movement is a long-running process. If the database has multiple data files, you can speed up the process by shrinking multiple data files in parallel.

```sql
-- List the used, allocated, and max storage space for each file
SELECT file_id, name,
       CAST(FILEPROPERTY(name, 'SpaceUsed') AS bigint) * 8 / 1024. AS space_used_mb,
       CAST(size AS bigint) * 8 / 1024. AS space_allocated_mb,
       CAST(max_size AS bigint) * 8 / 1024. AS max_file_size_mb
FROM sys.database_files
WHERE type_desc IN ('ROWS', 'LOG');

-- Shrink the data file named 'data_0' by removing any unused space at the end of the file
DBCC SHRINKFILE ('data_0', TRUNCATEONLY);

-- Incrementally shrink to reclaim unused allocated space
DBCC SHRINKFILE ('data_0', 17000); -- shrink the data file to 17,000 MB
DBCC SHRINKFILE ('data_0', 14000); -- shrink the data file to 14,000 MB
DBCC SHRINKFILE ('data_0', 11000); -- shrink the data file to 11,000 MB
DBCC SHRINKFILE ('data_0');        -- shrink the data file to the minimum possible size

-- Track shrink progress
SELECT a.session_id
     , command
     , b.text
     , percent_complete
     , done_in_minutes = a.estimated_completion_time / 1000 / 60
     , min_in_progress = DATEDIFF(MI, a.start_time, DATEADD(ms, a.estimated_completion_time, GETDATE()))
     , a.start_time
     , estimated_completion_time = DATEADD(ms, a.estimated_completion_time, GETDATE())
FROM sys.dm_exec_requests a
CROSS APPLY sys.dm_exec_sql_text(a.sql_handle) b
WHERE command LIKE '%dbcc%'
```

After a
shrink operation is completed against data files, indexes might become fragmented. This reduces their performance optimization effectiveness for certain workloads, such as queries using large scans. If performance degradation occurs after the shrink operation is complete, consider index maintenance to rebuild the indexes.