Sunday, October 19, 2008

initialize upon the first use

This is a technique coverd in Programming Pearls.  (Chap 1, Problems 9)

This method use constant time for initialization and for each vector access, and use extra space proportional to the size of the vector. Because this method reduces initialization time by using even more space, it should be considered only when space is cheap, time is deer and the vector is sparse.

My test code here ( #define COL   to enable 2-D matrix test case)

#include <iostream>
#include <ctime>
using namespace std;

// max row and col size
#define ROW 8000
// if 1-d, just comment the next line
#define COL 8000
int top = 0;

#ifndef COL
int data[ROW];
#define MAXE ROW
#else
int data[ROW][COL];
#define MAXE ROW*COL
#endif

// two additional array to implement a simple signature
int from[MAXE];
int to[MAXE];

// if element i in the array is initialised
inline bool exist(int i, int j=0) {
#ifdef COL
int idx = COL*i+j;
#else
int idx = i;
#endif
if(from[idx] < top && to[from[idx]] == idx)
return true;
return false;
}

// assign data value to the i-th element
#ifndef COL
inline void setdata(int i, int v) {
if(!exist(i)) {
from[i] = top;
to[top] = i;
++ top;
}
data[i] = v;
}

#else
inline void setdata(int i, int j, int v) {
if(!exist(i, j)) {
int idx = COL*i+j;
from[idx] = top;
to[top] = idx;
++ top;
}
data[i][j] = v;
}
#endif

int main(int argc, char* argv[]) {
int startt = clock();
printf("initialize at start time--\n");
memset(data, 0, sizeof(data));

int n;
if(argc < 2 || atoi(argv[1]) > ROW) n = 100;
else n = atoi(argv[1]);

#ifndef COL
for(int i = 0; i < n; ++i)
data[i] = (i*i-i)/(i+1);
int starts = clock();
printf("use %d\n", starts-startt);

printf("no initialization--\n");
for(int i = 0; i < n; i++)
setdata(i, (i*i-i)/(i+1));
printf("use %d\n", clock()-starts);

#else
for(int i = 0; i < n; i++)
for(int j = 0; j < n; ++j)
data[i][j] = (i*j-i)/(i+1);

int starts = clock();
printf("use %d\n", starts-startt);

printf("no initialization--\n");
for(int i = 0; i < n; i++)
for(int j = 0; j < n; ++j)
setdata(i, j, (i*j-i)/(i+1));

printf("use %d\n", clock()-starts);
#endif
}
From the test result: 

  • When the operations overhead is much less than the initialization cost, use init upon the first use.
  • When the vector/matrix is not sparse (e.g., > 10% full), memset is fast enough.
  • It seems the access speed of alg2 is 10 times slower than direct access speed
    +------------+----------+-----------+----------+
    |matrix dim |# op |alg 1 (ms) |alg 2 (ms)|
    |8000 x 8000 |10 x 10 |312 |0 |
    |8000 x 8000 |100x100 |312 |0 |
    |8000 x 8000 |1000x1000 |328 |187 |
    |8000 x 8000 |4000x4000 |531 |2922 |
    +------------+----------+-----------+----------+
  •