// Fungimol - an extensible system for designing atomic-scale objects.
// Copyright (C) 2000 Tim Freeman
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Library General Public
// License as published by the Free Software Foundation; either
// version 2 of the License, or (at your option) any later version.
// 
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Library General Public License for more details.
// 
// You should have received a copy of the GNU Library General Public
// License along with this library in the file COPYING.txt; if not,
// write to the Free Software Foundation, Inc., 59 Temple Place -
// Suite 330, Boston, MA 02111-1307, USA
//
// The author can be reached by email at tim@infoscreen.com, or by
// paper mail at:
//
// Tim Freeman
// 655 S. FairOaks Ave., Apt B-316
// Sunnyvale, CA 94086
//

#include "Dynavec.h"

#ifdef UNIT_TEST

// Compile command:
// g++ -Wall -Winline -W -Wwrite-strings -Werror -pipe -felide-constructors -DNDEBUG -O3 -march=pentiumpro -malign-double -mwide-multiply -ffast-math -fomit-frame-pointer -fno-exceptions /home/tim/fungimol/Util/Dynavec.cpp -DUNIT_TEST -o /tmp/a.out nodebug/memory.o nodebug/myassert.o

#include "myassert.h"

#ifndef NDEBUG

// Debug-build (NDEBUG undefined) smoke test: fill a Dynavec<int> with zeros
// a few times so that any checks compiled into Dynavec in debug mode get
// exercised.
//
// This used to walk the vector with Dynavec<int>::Iterator, but the note at
// the end of this file records that the iterator was removed from Dynavec,
// so the loop now uses size() and operator[] like the other variants here.
// It visits the same elements, [0, size()), in the same order.
int main () {
  Dynavec <int> l;
  l.extendTo (1000);
  for (int j = 0; j < 10; j++) {
    // Read the size once per pass; the vector is not resized while filling.
    const int s = l.size();
    for (int i = 0; i < s; i++) {
      l [i] = 0;
    }
  }
  return 0;
}

#else

#if 0
// Time for the following is:
// 0.590u 0.000s 0:00.59 100.0%	0+0k 0+0io 127pf+0w
// 0.600u 0.000s 0:00.59 101.6%	0+0k 0+0io 127pf+0w
// 0.590u 0.000s 0:00.59 100.0%	0+0k 0+0io 127pf+0w
// 0.600u 0.000s 0:00.59 101.6%	0+0k 0+0io 127pf+0w
// Inner loop is:
//.L68:
//	movl 16(%esp),%eax
//	movl $0,(%eax,%edx,4)
//	incl %edx
//	cmpl %ecx,%edx
//	jl .L68
// Notice that the code for finding &l[0] didn't manage to get hoisted
// out of the loop.
// Benchmark variant (disabled): zero every element through operator[] using
// an int index.  The code shape is kept exactly as measured so that it still
// matches the timings and assembly listing recorded above.
int main () {
  Dynavec <int> l;
  l.extendTo (1000);
  // Repeat the fill 100000 times so that the run time is measurable.
  for (int j = 0; j < 100000; j++) {
    // size() is read once per pass, so the inner loop compares the index
    // against a local.
    const int s = l.size();
    for (int i = 0; i < s; i++) {
      l [i] = 0;
    }
  }
}
#endif

#if 0

// Time for the following is:
// 1.000u 0.020s 0:01.01 100.9%	0+0k 0+0io 127pf+0w
// 1.010u 0.010s 0:01.01 100.9%	0+0k 0+0io 127pf+0w
// 1.010u 0.000s 0:01.01 100.0%	0+0k 0+0io 127pf+0w
// 1.020u 0.000s 0:01.01 100.9%	0+0k 0+0io 127pf+0w
// Inner loop is:
//.L91:
//	movl $0,(%eax)
//	movl 16(%esp),%eax
//	leal 4(%eax),%edi
//	movl %edi,16(%esp)
//	movl %edi,%eax
//	cmpl 4(%ebx),%eax
//	jb .L91
// 16(%esp) and 4(%ebx) and 4(%16(%esp)) never made it into a register.
// Benchmark variant (disabled): zero every element by walking the vector
// with Dynavec<int>::Iterator.  The code shape is kept exactly as measured so
// that it still matches the timings and assembly listing recorded above.
int main () {
  Dynavec <int> l;
  l.extendTo (1000);
  // Repeat the fill 100000 times so that the run time is measurable.
  for (int j = 0; j < 100000; j++) {
    // A fresh iterator is constructed from the vector on every pass.
    for (Dynavec<int>::Iterator i = l;
	 !i.isDone();
	 i.Next()) {
      *i = 0;
    }
  }
}
#endif
#if 0
//0.510u 0.000s 0:00.51 100.0%	0+0k 0+0io 127pf+0w
//0.500u 0.010s 0:00.51 100.0%	0+0k 0+0io 127pf+0w
//0.510u 0.000s 0:00.51 100.0%	0+0k 0+0io 127pf+0w
//0.510u 0.000s 0:00.51 100.0%	0+0k 0+0io 127pf+0w
// Here's the best possible inner loop without unrolling, but we're
// probably limited by the jb messing up the pipeline so it doesn't
// matter that this loop has fewer instructions than the one with the l.size()
// addition below.
// .L77:
// 	movl $0,(%eax)
// 	addl $4,%eax
// 	cmpl %ecx,%eax
// 	jb .L77
// Benchmark variant (disabled): zero every element with a raw int pointer
// that runs from &l[0] up to a precomputed end pointer.  The code shape is
// kept exactly as measured so that it still matches the timings and assembly
// listing recorded above.
int main () {
  Dynavec <int> l;
  l.extendTo (1000);
  // Repeat the fill 100000 times so that the run time is measurable.
  for (int j = 0; j < 100000; j++) {
    // k is one past the last element; it is computed once per pass.
    const int *k = (&l[0])+l.size();
    for (int *i = &l[0]; i < k; i++) {
      *i = 0;
    }
  }
}
#endif
#if 0
// Unrolling really helps.
// 0.260u 0.000s 0:00.26 100.0%	0+0k 0+0io 127pf+0w
// 0.260u 0.000s 0:00.26 100.0%	0+0k 0+0io 127pf+0w
// 0.260u 0.000s 0:00.26 100.0%	0+0k 0+0io 127pf+0w
// 0.260u 0.000s 0:00.26 100.0%	0+0k 0+0io 127pf+0w
// Benchmark variant (disabled): raw-pointer fill, manually unrolled to four
// stores per loop iteration, followed by a tail loop for the leftovers.  The
// code shape is kept exactly as measured so that it still matches the timings
// recorded above.
int main () {
  Dynavec <int> l;
  l.extendTo (1000);
  // Repeat the fill 100000 times so that the run time is measurable.
  for (int j = 0; j < 100000; j++) {
    // s is size() rounded down to a multiple of 4 (the low two bits are
    // cleared by the shift pair).
    const int s = (l.size() >> 2) << 2;
    // k marks the end of the part that can be handled four at a time.
    int *const k = ((&l[0])+s) ;
    for (int *i = &l[0]; i < k;) {
      *i++ = 0;
      *i++ = 0;
      *i++ = 0;
      *i++ = 0;
    }
    // q is one past the last element; this loop zeroes the 0 to 3 elements
    // that the unrolled loop did not reach.
    int *const q = &l[0]+l.size();
    for (int *i = k; i < q; i++) {
      *i = 0;
    }
  }
}
#endif
#if 0
// But more unrolling may not help much more.
// 0.240u 0.000s 0:00.24 100.0%	0+0k 0+0io 128pf+0w
// 0.240u 0.000s 0:00.23 104.3%	0+0k 0+0io 128pf+0w
// 0.240u 0.000s 0:00.24 100.0%	0+0k 0+0io 128pf+0w
// 0.240u 0.000s 0:00.24 100.0%	0+0k 0+0io 128pf+0w
// Benchmark variant (disabled): raw-pointer fill, manually unrolled to eight
// stores per loop iteration, followed by a tail loop for the leftovers.  The
// code shape is kept exactly as measured so that it still matches the timings
// recorded above.
int main () {
  Dynavec <int> l;
  l.extendTo (1000);
  // Repeat the fill 100000 times so that the run time is measurable.
  for (int j = 0; j < 100000; j++) {
    // s is size() rounded down to a multiple of 8 (the low three bits are
    // cleared by the shift pair).
    const int s = (l.size() >> 3) << 3;
    // k marks the end of the part that can be handled eight at a time.
    int *const k = ((&l[0])+s) ;
    for (int *i = &l[0]; i < k;) {
      *i++ = 0;
      *i++ = 0;
      *i++ = 0;
      *i++ = 0;
      *i++ = 0;
      *i++ = 0;
      *i++ = 0;
      *i++ = 0;
    }
    // q is one past the last element; this loop zeroes the 0 to 7 elements
    // that the unrolled loop did not reach.
    int *const q = &l[0]+l.size();
    for (int *i = k; i < q; i++) {
      *i = 0;
    }
  }
}
#endif
#if 1
// The system routine isn't much better than unrolling to 4, and is
// worse than unrolling to 8.
// 0.250u 0.010s 0:00.25 104.0%	0+0k 0+0io 128pf+0w
// 0.250u 0.000s 0:00.25 100.0%	0+0k 0+0io 128pf+0w
// 0.250u 0.000s 0:00.25 100.0%	0+0k 0+0io 128pf+0w
// 0.250u 0.000s 0:00.25 100.0%	0+0k 0+0io 128pf+0w
#include <string.h>
// Benchmark variant (the one currently enabled): zero the whole vector with
// the C library's memset instead of a hand-written loop.
//
// The byte count is derived from sizeof (int) rather than the former
// "size() << 2", which silently assumed that an int is exactly four bytes
// wide.  On targets where that holds the generated call is unchanged.
int main () {
  Dynavec <int> l;
  l.extendTo (1000);
  // Repeat the fill 100000 times so that the run time is measurable.
  for (int j = 0; j < 100000; j++) {
    // &l[0] is treated as the start of contiguous storage, as the
    // raw-pointer variants in this file also do.  int * converts to void *
    // implicitly, so no cast is needed.
    memset (&l[0], 0, static_cast<size_t> (l.size()) * sizeof (int));
  }
  return 0;
}
#endif
#if 0
// The conclusion I draw from all this is that it's easy to be memory
// bandwidth limited, so optimization means touching as little memory
// as possible.
// Here's the code for Dynavec<int>::Iterator that I took out because
// it didn't speed things up:
  // This iterator may break if the dynavec is extended to a greater
  // length while iterating.
  // Forward cursor over the elements of a Dynavec, kept here (disabled) as a
  // record of code that was removed from the Dynavec class.  It refers to T
  // and to Dynavec unqualified, so it only makes sense nested inside that
  // template.  Usage is: construct from a vector, then loop while
  // !isDone(), dereferencing with * and advancing with Next().
  class Iterator {
    // Current position.
    T *m_here;
    // One past the final element, fixed at construction time.
    T *const m_last;
#ifndef NDEBUG
    // Debug-only bookkeeping: the address of element 0 as seen at
    // construction, and the vector being iterated.
    T *m_check;
    Dynavec * m_start;
#endif
    // Asserts that element 0 of the vector is still at the address recorded
    // at construction.  NOTE(review): m_start and m_check exist only when
    // NDEBUG is undefined, so this relies on the assert macro from
    // myassert.h discarding its argument under NDEBUG -- confirm.
    inline void check () const {
      assert (&((*m_start)[0]) == m_check);
    }
  public:
    // Returns a reference to the element at the current position.  No
    // isDone() test is made here; the caller is expected to do that.
    inline T& operator* () {
      check ();
      return *m_here;
    }
    // True once the position has reached (or passed) the end pointer.
    inline bool isDone () const {
      check ();
      return m_here >= m_last;
    }
    // Advances to the next element.
    inline void Next () {
      check ();
      m_here++;
    }
    // Starts at element 0 of v.  The end pointer is computed from v.size()
    // once, here, and is never refreshed afterwards.  The constructor is not
    // explicit, which is what allows "Dynavec<int>::Iterator i = l;".
    Iterator (Dynavec &v)
      : m_here (&v[0]), m_last ((&v[0]) + v.size())
#ifndef NDEBUG
      // Remember where the storage started so that check() can compare.
      , m_check (&v[0]), m_start (&v)
#endif
    {}
  };
#endif
#endif
#endif
