Recently I worked on an interesting problem. My friend was developing an add-in for Excel 2010 and had a requirement to read content of each cell into a DataTable. Due to some restrictions with "named range" feature, he had to resort to reading value of each cell in a selected area. The code seemed alright but for some unexplained reason it was very slow.
The add-in used the standard Visual Studio template for Office add-in projects and referenced the Office Interop assemblies for performing operations on an Excel file. The main code used a "for" loop through each cell and used Range.Cells[rowindex, columnindex].Value to read the value from each cell.
The content in the selected range of sample file was also pretty small in size as it had 26 columns and 325 rows and each cell consisted of small string values e.g. "Test". Still, the add-in took about 18-20 seconds for reading the content.
After going through some results from a web search, I came across this link, which suggests assigning Range.Value to a local variable and looping through that instead of looping through Range.Cells. After changing the code, the add-in was able to complete the operation in less than a second :).
Lesson: Marshaling values between managed and unmanaged space is expensive and must be minimized.
Calls to Range.Cells[i,j].Value are expensive and on average took 2 milliseconds per call on my machine (dual core, 8 GB RAM) for the sample file. Below is sample code I wrote to showcase the performance gain when Range.Value is used instead of Range.Cells[i,j].Value.
/// <summary>
/// Opens a workbook, reads its used range into a DataTable twice — once from the
/// bulk Range.Value array and once cell-by-cell — and prints timings for both,
/// demonstrating the cost of per-cell COM interop marshaling.
/// </summary>
static void Main(string[] args)
{
var app = new Application();
Workbook workbook = null;
try
{
workbook = app.Workbooks.Open(@"D:\Excel.xlsx");
Range rng = ((Worksheet)workbook.Sheets[1]).UsedRange;
// Single interop call: marshals the entire range as a 1-based 2-D array.
var arrValues = rng.Value;
int maxrow = rng.Rows.Count;
int maxcol = rng.Columns.Count;
System.Data.DataTable dTable = new System.Data.DataTable("test");
// BUG FIX: the column count was hard-coded to 26; derive it from the range
// so the program works for any sheet width.
for (int i = 0; i < maxcol; i++)
{
dTable.Columns.Add();
}
Stopwatch stopwatch = new Stopwatch();
stopwatch.Start();
ReadExcelIntoDataTable(arrValues, maxrow, maxcol, dTable);
stopwatch.Stop();
Console.WriteLine(stopwatch.ElapsedMilliseconds);
// BUG FIX: the per-cell reader skips the header row and fills only
// (maxrow - 1) * maxcol slots; sizing the array to maxrow * maxcol left
// trailing zeros that skewed the reported average.
long[] arr = new long[(maxrow - 1) * maxcol];
dTable.Clear();
stopwatch.Restart();
ReadExcelIntoDataTable2(rng, maxrow, maxcol, dTable, arr);
stopwatch.Stop();
Console.WriteLine(stopwatch.ElapsedMilliseconds);
Console.WriteLine("Total cell read time : {0}", arr.Sum());
Console.WriteLine("Average cell read time : {0}", arr.Average());
Console.WriteLine("Min cell read time : {0}", arr.Min());
Console.WriteLine("Max cell read time : {0}", arr.Max());
Console.WriteLine("Total items : {0}", arr.Length);
}
finally
{
// Release COM objects deterministically so EXCEL.EXE does not linger.
if (workbook != null)
{
// BUG FIX: pass false to discard changes; a bare Close() can pop a
// save prompt and hang an unattended run.
workbook.Close(false);
Marshal.ReleaseComObject(workbook);
}
if (app != null)
{
// BUG FIX: without Quit() the Excel process stays alive after the
// RCW is released.
app.Quit();
Marshal.ReleaseComObject(app);
}
}
Console.ReadLine();
}
/// <summary>
/// Copies values from the 1-based 2-D array returned by Range.Value into the
/// supplied DataTable, skipping the first (header) row.
/// </summary>
/// <param name="arrValues">1-based 2-D array obtained from Range.Value.</param>
/// <param name="maxrow">Number of rows in the range (inclusive upper bound).</param>
/// <param name="maxcol">Number of columns in the range (inclusive upper bound).</param>
/// <param name="dTable">Target table; must already have maxcol columns.</param>
private static void ReadExcelIntoDataTable(dynamic arrValues, int maxrow, int maxcol, System.Data.DataTable dTable)
{
// Row 1 is treated as a header and skipped; Excel range arrays are 1-based.
for (int row = 2; row <= maxrow; row++)
{
DataRow newRow = dTable.NewRow();
for (int col = 1; col <= maxcol; col++)
{
// DataTable columns are 0-based, hence the col - 1 shift.
newRow[col - 1] = arrValues[row, col];
}
dTable.Rows.Add(newRow);
}
}
/// <summary>
/// Reads the range into the DataTable one cell at a time via Range.Cells,
/// skipping the first (header) row, and records the elapsed milliseconds of
/// every individual cell read in <paramref name="arr"/>.
/// </summary>
/// <param name="rng">Source range; Cells is indexed 1-based.</param>
/// <param name="maxrow">Number of rows in the range (inclusive upper bound).</param>
/// <param name="maxcol">Number of columns in the range (inclusive upper bound).</param>
/// <param name="dTable">Target table; must already have maxcol columns.</param>
/// <param name="arr">Receives one timing sample per cell read, in order.</param>
private static void ReadExcelIntoDataTable2(Range rng, int maxrow, int maxcol, System.Data.DataTable dTable, long[] arr)
{
int sampleIndex = 0;
Stopwatch cellTimer = new Stopwatch();
// Row 1 is treated as a header and skipped; Excel cells are 1-based.
for (int row = 2; row <= maxrow; row++)
{
DataRow newRow = dTable.NewRow();
for (int col = 1; col <= maxcol; col++)
{
// Each Cells[...].Value access crosses the managed/COM boundary,
// which is the expensive operation being measured here.
cellTimer.Restart();
newRow[col - 1] = rng.Cells[row, col].Value;
cellTimer.Stop();
arr[sampleIndex] = cellTimer.ElapsedMilliseconds;
sampleIndex++;
}
dTable.Rows.Add(newRow);
}
}
Below is the output of the program: the first number is the array-based read (ReadExcelIntoDataTable) and the second is the cell-by-cell read (ReadExcelIntoDataTable2). Notice the reduction in execution time from 24+ seconds to 41 milliseconds :)
41
24150
Total cell read time : 19818
Average cell read time : 2.34532544378698
Min cell read time : 0
Max cell read time : 208
Total items : 8450