/*
 * Storage layer implementation using a single backing file.
 * The file format is as follows:
 *
 * [block header]
 * [block descriptor 0]
 * [data]
 * [block descriptor 1]
 * [data]
 * ...
 */
#include <sys/types.h>
#include <sys/stat.h>

#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>

#include "block.h"
#include "compat.h"
#include "config.h"
#include "misc.h"
#include "queue.h"
#include "tree.h"

/*
 * Block header constants.
 *
 * BHDRMAGIC is a 15 character string; together with its terminating
 * NUL byte it occupies exactly NBHDRMAGIC bytes in the header.
 */
#define BHDRMAGIC	"DEDUPDIDUPDIDUP"
#define NBHDRMAGIC	16

/*
 * Format version stored in the header flags field: the major version
 * is shifted left by VMAJSHIFT and the minor version sits in the low
 * bits (see bscreat() and the version check in bsopen()).
 */
#define VMIN		0
#define VMAJ		1
#define VMINMASK	0xff
#define VMAJSHIFT	8
#define VMAJMASK	0xff

/* Packed header size: magic + 64-bit flags + 64-bit descriptor count */
#define BHDRSIZE	(NBHDRMAGIC + 8 + 8)

/*
 * Block descriptor constants.
 *
 * BDTYPE is the value every valid descriptor carries in its type field.
 * BDSIZE is the packed descriptor size: 8 bytes for type + reserved,
 * then 8 bytes each for offset, size and refcnt, then the MDSIZE hash.
 */
#define BDTYPE		0x100
#define BDSIZE		(8 + 8 + 8 + 8 + (MDSIZE))

/*
 * Misc helpers, defined outside this file.
 *
 * Both take a byte buffer and a format string followed by the values
 * (pack) or pointers to the destinations (unpack).  The callers in this
 * file assert that the return value equals the packed size in bytes.
 * NOTE(review): the exact format-string grammar is not visible here --
 * confirm against the pack/unpack implementation.
 */
extern int pack(unsigned char *, char *, ...);
extern int unpack(unsigned char *, char *, ...);

/*
 * Forward declarations of the storage operations so the bops table
 * below can reference them before their definitions.
 */
static int bscreat(struct bctx *, char *, int);
static int bsopen(struct bctx *, char *, int, int);
static int bsput(struct bctx *, void *, size_t, unsigned char *);
static int bsget(struct bctx *, unsigned char *, void *, size_t *);
static int bsrm(struct bctx *, unsigned char *);
static int bsgc(struct bctx *);
static int bssync(struct bctx *);
static int bsclose(struct bctx *);

/*
 * Operations table for this storage backend.
 * A pointer to it is handed out by bstorageops().
 */
static struct bops bops = {
	.creat = bscreat,
	.open = bsopen,
	.put = bsput,
	.get = bsget,
	.rm = bsrm,
	.gc = bsgc,
	.sync = bssync,
	.close = bsclose,
};

/*
 * Block header structure.
 * In-memory form of the header stored at the start of the storage
 * file; it is converted with packbhdr()/unpackbhdr().
 */
struct bhdr {
	char magic[NBHDRMAGIC]; /* magic number for file(1), compared against BHDRMAGIC */
	uint64_t flags;		/* format version: (major << VMAJSHIFT) | minor */
	uint64_t nbd;		/* number of block descriptors in the file */
};

/*
 * Block descriptor.
 * The first six fields are stored in the file in front of each block
 * (see packbd()/unpackbd()); the two link nodes exist only in memory.
 */
struct bd {
	uint16_t type;			/* BDTYPE */
	unsigned char reserved[6];	/* should be set to 0 when writing */
	uint64_t offset;		/* file offset of the block data, right after this descriptor */
	uint64_t size;			/* block size in bytes */
	uint64_t refcnt;		/* reference count of block, 0 if block is removed */
	unsigned char md[MDSIZE];	/* hash of block, used as the bdcache key */
	RB_ENTRY(bd) rbe;		/* bdcache link node */
	SLIST_ENTRY(bd) sle;		/* gchead link node */
};
/* Red-black tree head type for the descriptor cache, ordered by bd_cmp() */
RB_HEAD(bdcache, bd);

/*
 * Storage layer context.
 * Allocated by bscreat()/bsopen(), stored in bctx->sctx and released
 * by bsclose().
 */
struct sctx {
	struct bdcache bdcache;		/* cache of live block descriptors (refcnt > 0) */
	SLIST_HEAD(gchead, bd) gchead;	/* list of all blocks with a zero refcount */
	int fd;				/* underlying storage file descriptor */
	int rdonly;			/* when set to 1, the bssync() operation is a no-op */
	struct bhdr bhdr;		/* block header, written back by bssync() */
};

/*
 * Ordering function for the block descriptor cache.
 * Compares the hashes of the two descriptors and normalizes the
 * memcmp() result to -1, 0 or 1.
 */
static int
bd_cmp(struct bd *b1, struct bd *b2)
{
	int diff = memcmp(b1->md, b2->md, MDSIZE);

	return (diff > 0) - (diff < 0);
}
static RB_PROTOTYPE(bdcache, bd, rbe, bd_cmp)
static RB_GENERATE(bdcache, bd, rbe, bd_cmp)

/*
 * Decode a packed block header from buf into bhdr.
 * Returns the number of bytes consumed, which is asserted to be
 * BHDRSIZE.
 */
static int
unpackbhdr(unsigned char *buf, struct bhdr *bhdr)
{
	char layout[BUFSIZ];
	int nbytes;

	/* Layout: the magic bytes followed by the flags and nbd fields */
	snprintf(layout, sizeof(layout), "'%dqq", NBHDRMAGIC);
	nbytes = unpack(buf, layout, bhdr->magic, &bhdr->flags, &bhdr->nbd);
	assert(nbytes == BHDRSIZE);

	return nbytes;
}

/*
 * Encode the block header bhdr into buf.
 * Returns the number of bytes produced, which is asserted to be
 * BHDRSIZE.
 */
static int
packbhdr(unsigned char *buf, struct bhdr *bhdr)
{
	char layout[BUFSIZ];
	int nbytes;

	/* Layout: the magic bytes followed by the flags and nbd fields */
	snprintf(layout, sizeof(layout), "'%dqq", NBHDRMAGIC);
	nbytes = pack(buf, layout, bhdr->magic, bhdr->flags, bhdr->nbd);
	assert(nbytes == BHDRSIZE);

	return nbytes;
}

/*
 * Decode a packed block descriptor from buf into bd.
 * Only the stored fields are filled in; the link nodes are untouched.
 * Returns the number of bytes consumed, which is asserted to be BDSIZE.
 */
static int
unpackbd(unsigned char *buf, struct bd *bd)
{
	char layout[BUFSIZ];
	int nbytes;

	/* Layout: type, reserved bytes, offset, size, refcnt, hash */
	snprintf(layout, sizeof(layout), "s'6qqq'%d", MDSIZE);
	nbytes = unpack(buf, layout, &bd->type, bd->reserved, &bd->offset,
	                &bd->size, &bd->refcnt, bd->md);
	assert(nbytes == BDSIZE);

	return nbytes;
}

/*
 * Encode the block descriptor bd into buf.
 * Nothing is written to the storage file here; callers do that.
 * Returns the number of bytes produced, which is asserted to be BDSIZE.
 */
static int
packbd(unsigned char *buf, struct bd *bd)
{
	char layout[BUFSIZ];
	int nbytes;

	/* Layout: type, reserved bytes, offset, size, refcnt, hash */
	snprintf(layout, sizeof(layout), "s'6qqq'%d", MDSIZE);
	nbytes = pack(buf, layout, bd->type, bd->reserved, bd->offset,
	              bd->size, bd->refcnt, bd->md);
	assert(nbytes == BDSIZE);

	return nbytes;
}

/*
 * Load the block descriptor at the current file offset.
 *
 * Reads BDSIZE bytes from sctx->fd, unpacks them into a freshly
 * allocated descriptor and seeks past the block data that follows, so
 * the file offset is left at the next descriptor.
 *
 * Returns 0 on success.  On failure the error is recorded with
 * seterr(), the descriptor is freed and -1 is returned.
 */
static int
loadbd(struct sctx *sctx)
{
	unsigned char bdbuf[BDSIZE];
	struct bd *bd;

	bd = calloc(1, sizeof(*bd));
	if (bd == NULL) {
		seterr("calloc: out of memory");
		return -1;
	}

	if (xread(sctx->fd, bdbuf, BDSIZE) != BDSIZE) {
		/* Record the error before free() gets a chance to touch errno */
		seterr("failed to read block descriptor: %s",
		        strerror(errno));
		free(bd);
		return -1;
	}
	unpackbd(bdbuf, bd);

	if (bd->type != BDTYPE) {
		/* The message reads bd->type, so report before freeing bd */
		seterr("invalid block descriptor type: %d", bd->type);
		free(bd);
		return -1;
	}

	/* Move to the next block descriptor */
	if (lseek(sctx->fd, bd->size, SEEK_CUR) < 0) {
		seterr("lseek: %s", strerror(errno));
		free(bd);
		return -1;
	}

	/*
	 * When refcount is 0 the block has been removed.
	 * In that case, the block descriptor is still present
	 * in the file as it is used to locate the next block
	 * descriptor which could be live.
	 *
	 * The garbage collection list links together all block
	 * descriptors that have a reference count of 0.
	 * This is needed to implement the gc operation.
	 */
	if (bd->refcnt > 0)
		RB_INSERT(bdcache, &sctx->bdcache, bd);
	else
		SLIST_INSERT_HEAD(&sctx->gchead, bd, sle);
	return 0;
}

/*
 * Populate the descriptor cache and the garbage collection list.
 *
 * Calls loadbd() once for every descriptor counted in the block
 * header.  If any load fails, everything loaded so far is released
 * and -1 is returned; otherwise 0 is returned.
 */
static int
initbdcache(struct sctx *sctx)
{
	struct bd *cur, *next;
	uint64_t left;

	for (left = sctx->bhdr.nbd; left > 0; left--) {
		if (loadbd(sctx) != 0)
			goto fail;
	}
	return 0;

fail:
	/* Drop every cached descriptor */
	RB_FOREACH_SAFE(cur, bdcache, &sctx->bdcache, next) {
		RB_REMOVE(bdcache, &sctx->bdcache, cur);
		free(cur);
	}

	/* Drop every descriptor on the garbage collector list */
	while ((cur = SLIST_FIRST(&sctx->gchead)) != NULL) {
		SLIST_REMOVE(&sctx->gchead, cur, bd, sle);
		free(cur);
	}
	return -1;
}

/*
 * Create a new storage file.
 *
 * The file at path is created exclusively (it must not already exist)
 * with the given mode, and an initial block header carrying the current
 * format version and a descriptor count of zero is written to it.
 *
 * On success bctx->sctx points to the new storage context and 0 is
 * returned.  On failure the error is recorded with seterr() and -1 is
 * returned.  If the failure happens after the file was created, the
 * file is removed again and bctx->sctx is set to NULL so that no stale
 * pointer is left behind.
 */
static int
bscreat(struct bctx *bctx, char *path, int mode)
{
	unsigned char bhdrbuf[BHDRSIZE];
	struct sctx *sctx;
	struct bhdr *bhdr;
	int fd;

	fd = open(path, O_RDWR | O_CREAT | O_EXCL, mode);
	if (fd < 0) {
		seterr("open: %s", strerror(errno));
		return -1;
	}

	sctx = calloc(1, sizeof(*sctx));
	if (sctx == NULL) {
		seterr("calloc: out of memory");
		goto fail;
	}

	RB_INIT(&sctx->bdcache);
	SLIST_INIT(&sctx->gchead);
	sctx->fd = fd;

	bhdr = &sctx->bhdr;
	memcpy(bhdr->magic, BHDRMAGIC, NBHDRMAGIC);
	bhdr->flags = (VMAJ << VMAJSHIFT) | VMIN;
	bhdr->nbd = 0;

	packbhdr(bhdrbuf, bhdr);
	if (xwrite(fd, bhdrbuf, BHDRSIZE) != BHDRSIZE) {
		/* Record the error before cleanup can overwrite errno */
		seterr("failed to write block header: %s", strerror(errno));
		goto fail;
	}

	bctx->sctx = sctx;
	return 0;

fail:
	free(sctx);
	close(fd);
	/*
	 * The file was created by this call (O_EXCL), so removing it
	 * restores the previous state and lets a later attempt succeed.
	 */
	unlink(path);
	bctx->sctx = NULL;
	return -1;
}

/*
 * Open an existing storage file.
 *
 * flags must be B_READ or B_RDWR; any other value is rejected.  The
 * block header is read and validated (magic and major version) and
 * then all block descriptors are loaded through initbdcache().
 *
 * On success bctx->sctx points to the new storage context and 0 is
 * returned.  On failure the error is recorded with seterr() and -1 is
 * returned; if the file had already been opened it is closed and
 * bctx->sctx is set to NULL so that no stale pointer is left behind.
 */
static int
bsopen(struct bctx *bctx, char *path, int flags, int mode)
{
	unsigned char bhdrbuf[BHDRSIZE];
	struct sctx *sctx;
	struct bhdr *bhdr;
	int fd;

	switch (flags) {
	case B_READ:
		flags = O_RDONLY;
		break;
	case B_RDWR:
		flags = O_RDWR;
		break;
	default:
		seterr("invalid params");
		return -1;
	}

	fd = open(path, flags, mode);
	if (fd < 0) {
		seterr("open: %s", strerror(errno));
		return -1;
	}

	sctx = calloc(1, sizeof(*sctx));
	if (sctx == NULL) {
		seterr("calloc: out of memory");
		goto fail;
	}

	RB_INIT(&sctx->bdcache);
	SLIST_INIT(&sctx->gchead);
	bhdr = &sctx->bhdr;

	if (xread(fd, bhdrbuf, BHDRSIZE) != BHDRSIZE) {
		/* Record the error before cleanup can overwrite errno */
		seterr("failed to read block header: %s", strerror(errno));
		goto fail;
	}
	unpackbhdr(bhdrbuf, bhdr);

	if (memcmp(bhdr->magic, BHDRMAGIC, NBHDRMAGIC) != 0) {
		seterr("unknown block header magic");
		goto fail;
	}

	/* If the major version is different, the format is incompatible */
	if (((bhdr->flags >> VMAJSHIFT) & VMAJMASK) != VMAJ) {
		seterr("block header version mismatch");
		goto fail;
	}

	sctx->fd = fd;
	sctx->rdonly = flags == O_RDONLY;

	/* initbdcache() records its own error and frees what it loaded */
	if (initbdcache(sctx) < 0)
		goto fail;

	bctx->sctx = sctx;
	return 0;

fail:
	free(sctx);
	close(fd);
	bctx->sctx = NULL;
	return -1;
}

/*
 * Write a block to the storage file.
 *
 * md is the hash identifying the block; buf holds n bytes of data.
 * If a live block with the same hash is already cached, only its
 * reference count is incremented and its descriptor rewritten in
 * place.  Otherwise a new descriptor followed by the data is appended
 * at the end of the file and the descriptor is added to the cache.
 * In both cases the descriptor's hash is copied back into md.
 *
 * Returns 0 when an existing block was referenced again, the block
 * size when a new block was appended, and -1 on error (recorded with
 * seterr()).
 */
static int
bsput(struct bctx *bctx, void *buf, size_t n, unsigned char *md)
{
	unsigned char bdbuf[BDSIZE];
	struct sctx *sctx;
	struct bhdr *bhdr;
	struct bd key, *bd;
	off_t offs;

	/*
	 * If the block is already present in the cache
	 * just increment the reference count and write back
	 * the block descriptor associated with that block.
	 */
	sctx = bctx->sctx;
	memcpy(key.md, md, MDSIZE);
	bd = RB_FIND(bdcache, &sctx->bdcache, &key);
	if (bd != NULL) {
		off_t bdoffs;

		/* The descriptor sits immediately in front of the data */
		bdoffs = bd->offset - BDSIZE;
		if (lseek(sctx->fd, bdoffs, SEEK_SET) < 0) {
			seterr("lseek: %s", strerror(errno));
			return -1;
		}

		bd->refcnt++;
		packbd(bdbuf, bd);
		if (xwrite(sctx->fd, bdbuf, BDSIZE) != BDSIZE) {
			/* Keep the in-memory count in step with the file */
			bd->refcnt--;
			seterr("failed to write block descriptor: %s",
			        strerror(errno));
			return -1;
		}

		memcpy(md, bd->md, MDSIZE);
		return 0;
	}

	/* New blocks are appended at the end of storage file */
	offs = lseek(sctx->fd, 0, SEEK_END);
	if (offs < 0) {
		seterr("lseek: %s", strerror(errno));
		return -1;
	}

	bd = calloc(1, sizeof(*bd));
	if (bd == NULL) {
		seterr("calloc: out of memory");
		return -1;
	}
	bd->type = BDTYPE;
	bd->offset = offs + BDSIZE;
	bd->size = n;
	bd->refcnt = 1;
	memcpy(bd->md, key.md, MDSIZE);

	packbd(bdbuf, bd);
	if (xwrite(sctx->fd, bdbuf, BDSIZE) != BDSIZE) {
		/* Record the error before the rollback can overwrite errno */
		seterr("failed to write block descriptor: %s",
		        strerror(errno));
		goto rollback;
	}

	if (xwrite(sctx->fd, buf, n) != n) {
		/* Record the error before the rollback can overwrite errno */
		seterr("failed to write block: %s", strerror(errno));
		goto rollback;
	}

	/*
	 * Update block entry header.
	 * The header will be written to the storage file
	 * when bsclose() or bssync() is called.
	 */
	bhdr = &sctx->bhdr;
	bhdr->nbd++;

	RB_INSERT(bdcache, &sctx->bdcache, bd);
	memcpy(md, bd->md, MDSIZE);
	return bd->size;

rollback:
	/*
	 * Shouldn't fail but if it does rewind storage file state.
	 * The truncation is best effort: the write error has already
	 * been recorded and is the one worth reporting.
	 */
	ftruncate(sctx->fd, offs);
	free(bd);
	return -1;
}

/* Read a block from the storage file */
static int
bsget(struct bctx *bctx, unsigned char *md, void *buf, size_t *n)
{
	struct sctx *sctx;
	struct bd key, *bd;

	sctx = bctx->sctx;
	memcpy(key.md, md, MDSIZE);
	bd = RB_FIND(bdcache, &sctx->bdcache, &key);
	if (bd == NULL) {
		seterr("block not found");
		return -1;
	}

	if (*n < bd->size) {
		seterr("buffer too small");
		return -1;
	}

	if (lseek(sctx->fd, bd->offset, SEEK_SET) < 0) {
		seterr("lseek: %s", strerror(errno));
		return -1;
	}
	if (xread(sctx->fd, buf, bd->size) != bd->size) {
		seterr("failed to read block: %s", strerror(errno));
		return -1;
	}
	*n = bd->size;
	return 0;
}

/*
 * Remove a block with the given hash.
 *
 * The reference count of the block is decremented and its descriptor
 * rewritten in place.  When the count drops to zero the block data is
 * released with punchhole() and the descriptor moves from the cache to
 * the garbage collector list.
 *
 * Returns 0 on success.  On failure the error is recorded with
 * seterr() and -1 is returned.
 */
static int
bsrm(struct bctx *bctx, unsigned char *md)
{
	unsigned char bdbuf[BDSIZE];
	struct sctx *sctx;
	struct bd key, *bd;
	off_t bdoffs;

	sctx = bctx->sctx;
	memcpy(key.md, md, MDSIZE);
	bd = RB_FIND(bdcache, &sctx->bdcache, &key);
	if (bd == NULL) {
		seterr("block not found");
		return -1;
	}

	/* The descriptor sits immediately in front of the data */
	bdoffs = bd->offset - BDSIZE;
	if (lseek(sctx->fd, bdoffs, SEEK_SET) < 0) {
		seterr("lseek: %s", strerror(errno));
		return -1;
	}

	bd->refcnt--;
	packbd(bdbuf, bd);
	if (xwrite(sctx->fd, bdbuf, BDSIZE) != BDSIZE) {
		/* Keep the in-memory count in step with the file */
		bd->refcnt++;
		seterr("failed to write block descriptor: %s",
		        strerror(errno));
		return -1;
	}

	/* This block is still referenced so just return */
	if (bd->refcnt > 0)
		return 0;

	if (punchhole(sctx->fd, bd->offset, bd->size) < 0) {
		/*
		 * punchhole() failed, presumably because the filesystem
		 * does not support hole punching.
		 * Restore reference count.
		 */
		bd->refcnt++;
		packbd(bdbuf, bd);
		/*
		 * Only write once the seek is known to have succeeded;
		 * otherwise the descriptor would land at whatever offset
		 * the file happens to be positioned at.
		 */
		if (lseek(sctx->fd, bdoffs, SEEK_SET) < 0 ||
		    xwrite(sctx->fd, bdbuf, BDSIZE) != BDSIZE) {
			seterr("failed to restore block descriptor: %s",
			        strerror(errno));
			return -1;
		}
		seterr("operation not supported");
		return -1;
	}

	/*
	 * Remove block from block descriptor cache as this is no
	 * longer a valid block.  Insert it into the garbage collector
	 * list instead.
	 */
	RB_REMOVE(bdcache, &sctx->bdcache, bd);
	SLIST_INSERT_HEAD(&sctx->gchead, bd, sle);
	return 0;
}

/*
 * Re-punch all holes in the storage file.
 * This is needed when the storage file is copied from
 * one system to another and back.  The target system
 * may not support hole punching so the holes will be
 * filled with literal zeroes, negating the space saving
 * effects.
 */
static int
bsgc(struct bctx *bctx)
{
	struct sctx *sctx;
	struct bd *bd;

	sctx = bctx->sctx;
	SLIST_FOREACH(bd, &sctx->gchead, sle) {
		assert(bd->refcnt == 0);
		punchhole(sctx->fd, bd->offset, bd->size);
	}
	return 0;
}

/*
 * Sync block header to storage file.
 *
 * Rewrites the in-memory block header at the start of the file and
 * flushes the file with fsync().  Nothing is done when the storage was
 * opened read-only.
 *
 * Returns 0 on success.  On failure, including a failed fsync(), the
 * error is recorded with seterr() and -1 is returned.
 */
static int
bssync(struct bctx *bctx)
{
	unsigned char bhdrbuf[BHDRSIZE];
	struct sctx *sctx;
	struct bhdr *bhdr;

	sctx = bctx->sctx;
	if (sctx->rdonly)
		return 0;

	if (lseek(sctx->fd, 0, SEEK_SET) < 0) {
		seterr("lseek: %s", strerror(errno));
		return -1;
	}

	bhdr = &sctx->bhdr;
	packbhdr(bhdrbuf, bhdr);
	if (xwrite(sctx->fd, bhdrbuf, BHDRSIZE) != BHDRSIZE) {
		seterr("failed to write block header: %s", strerror(errno));
		return -1;
	}

	/* A failed flush means the data may not have reached the disk */
	if (fsync(sctx->fd) < 0) {
		seterr("fsync: %s", strerror(errno));
		return -1;
	}
	return 0;
}

/*
 * Close storage handle.
 *
 * Releases every descriptor held in the cache and on the garbage
 * collector list, closes the storage file and frees the storage
 * context.  Returns the result of close(); when that is negative the
 * error is also recorded with seterr().
 */
static int
bsclose(struct bctx *bctx)
{
	struct sctx *store = bctx->sctx;
	struct bd *cur, *next;
	int rc;

	/* Release the cached descriptors */
	RB_FOREACH_SAFE(cur, bdcache, &store->bdcache, next) {
		RB_REMOVE(bdcache, &store->bdcache, cur);
		free(cur);
	}

	/* Release the descriptors awaiting garbage collection */
	while ((cur = SLIST_FIRST(&store->gchead)) != NULL) {
		SLIST_REMOVE(&store->gchead, cur, bd, sle);
		free(cur);
	}

	rc = close(store->fd);
	free(store);
	if (rc < 0)
		seterr("close: %s", strerror(errno));
	return rc;
}

/*
 * Return the operations table of this storage backend.
 * The table is a file-scope object, so the pointer stays valid for the
 * lifetime of the program.
 */
struct bops *
bstorageops(void)
{
	struct bops *ops = &bops;

	return ops;
}
