I have some C# code that differs in one line only.
The resulting Asm differs in 4 lines. 1 is a duplicate but in a different location, while 3 are removed entirely.
I would expect the shorter code to run faster but I'm seeing a significant slowdown instead.
Please ignore the fact that there are a lot of things that can still be optimized. This is for an article I'm writing where I optimize it step by step and I was not expecting this intermediate result to be a regression.
Here is what I put into Sharplab.io to get Asm output. It matches what BenchmarkDotNet's disassembler produces:
using System;
using System.Collections.Generic;
using System.Text;
namespace PerformanceTesting
{
public readonly struct SuggestItem
{
public readonly string Term;
public readonly ulong Count;
public SuggestItem(string term, ulong count)
{
Term = term;
Count = count;
}
}
public sealed class CompressedSparseRowGraph
{
public readonly char[] EdgeCharacters;
public readonly uint[] FirstChildEdgeIndex;
public readonly int[] EdgeToNodeIndex;
public readonly int RootNodeIndex;
public readonly ushort[] ReachableTerminalNodes;
public readonly ulong[] WordCounts;
}
public sealed class Dawg
{
private readonly CompressedSparseRowGraph _graph;
public Dawg(CompressedSparseRowGraph graph)
{
_graph = graph;
}
private readonly struct ClosureVariable
{
public ClosureVariable(string word, uint maxEdits, CompressedSparseRowGraph graph, int[][] matrix, StringBuilder builder, List<SuggestItem> results) : this()
{
this.word = word;
this.maxEdits = maxEdits;
this.graph = graph;
this.matrix = matrix;
this.builder = builder;
this.results = results;
}
public readonly string word;
public readonly uint maxEdits;
public readonly CompressedSparseRowGraph graph;
public readonly int[][] matrix;
public readonly StringBuilder builder;
public readonly List<SuggestItem> results;
}
public IEnumerable<SuggestItem> Lookup(string word, uint maxEdits)
{
var builder = new StringBuilder(word.Length + (int)maxEdits);
builder.Append(new string(' ', word.Length + (int)maxEdits));
var results = new List<SuggestItem>();
var matrix = new int[word.Length + maxEdits + 1][];
for (var i = 0; i < matrix.Length; i++)
{
matrix[i] = new int[word.Length + 1];
matrix[i][0] = i - (int)maxEdits;
var stripeEnd = i + maxEdits + 1;
if (stripeEnd <= word.Length)
{
matrix[i][stripeEnd] = 0;
}
}
for (var i = 0; i < matrix[0].Length; i++)
{
matrix[0][i] = i - (int)maxEdits;
}
var closure = new ClosureVariable(word, maxEdits, _graph, matrix, builder, results);
Recurse(_graph.RootNodeIndex, 0, ref closure);
return results;
}
private static void Recurse(int currentNode, int depth, ref ClosureVariable closure)
{
if (depth == closure.word.Length + closure.maxEdits)
{
return;
}
var firstChild = closure.graph.FirstChildEdgeIndex[currentNode];
var lastChild = closure.graph.FirstChildEdgeIndex[currentNode + 1];
var from = depth - (int)closure.maxEdits;
if (from < 0)
{
from = 0;
}
from++;
- var to = Math.Min(closure.word.Length + 1, depth + closure.maxEdits + 2);
+ var to = (long)Math.Min(closure.word.Length + 1, depth + (int)closure.maxEdits + 2);
var previousCharacter = depth > 0 ? closure.builder[depth - 1] : (char)0;
var previousRow = closure.matrix[depth];
var currentRow = closure.matrix[depth + 1];
for (var childEdge = firstChild; childEdge < lastChild; childEdge++)
{
var any = false;
var currentCharacter = closure.graph.EdgeCharacters[childEdge];
closure.builder[depth] = currentCharacter;
var calculatedCost = depth + 1;
var previousRowEntry = previousRow[from - 1];
var targetCharacter = (char)0;
for (var i = from; i < to; i++)
{
var previousTargetCharacter = targetCharacter;
targetCharacter = closure.word[i - 1];
var previousRowPreviousEntry = previousRowEntry;
previousRowEntry = previousRow[i];
if (currentCharacter == targetCharacter)
{
calculatedCost = previousRowPreviousEntry;
}
else
{
if (previousRowEntry < calculatedCost)
{
calculatedCost = previousRowEntry;
}
if (targetCharacter == previousCharacter
&& previousTargetCharacter == currentCharacter)
{
previousRowPreviousEntry = closure.matrix[depth - 1][i - 2];
}
if (previousRowPreviousEntry < calculatedCost)
{
calculatedCost = previousRowPreviousEntry;
}
calculatedCost++;
}
if (calculatedCost <= 0)
{
any = true;
}
currentRow[i] = calculatedCost;
}
if (!any)
{
continue;
}
var nextNode = closure.graph.EdgeToNodeIndex[childEdge];
if (nextNode < 0)
{
nextNode = -nextNode;
if (depth >= closure.word.Length - closure.maxEdits - 1
&& calculatedCost <= 0)
{
closure.results.Add(new SuggestItem(closure.builder.ToString(0, depth + 1), 0));
}
}
Recurse(nextNode, depth + 1, ref closure);
}
}
}
}
The instruction difference in the generate assembly is as follows:
- L0099: movsxd r15, ecx
- L009c: movsxd rcx, edi
- L009f: mov edx, eax
- L00a1: lea r12, [rcx+rdx+2]
+ L0099: lea edx, [rdi+rax+2]
- L00a6: cmp r15, r12
+ L009d: cmp ecx, edx
- L00a9: jle short L00ad
+ L009f: jle short L00a3
- L00ab: jmp short L00b0
+ L00a1: jmp short L00a5
- L00ad: mov r12, r15
+ L00a3: mov edx, ecx
+ L00a5: movsxd r15, edx
So the registers used changes and there is a single movxsd is at the end instead of two before the comparisons.
There are some downstream changes due to the register difference and the first sample finishes at L0339 while the second does so at L0336. The difference there being smaller than the difference between the first two instructions after the diff of L00b0 and L00a5 respectively.
I'm running this on a ryzen 3800X which uses the zen 2 architecture.
I've read Agner Fog's optimization manuals and nothing is striking me as an obvious cause for the slowdown. Maybe alignment boundaries are to blame but I don't know how to verify that.
The benchmark difference is about 740ms for 1000 calls to Lookup vs 770ms for the second version.