1
0
mirror of https://github.com/haiwen/seafile-server.git synced 2025-04-28 19:35:10 +00:00
seafile-server/common/cdc/cdc.c
2018-02-06 14:37:27 +08:00

252 lines
8.1 KiB
C

/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
#include "common.h"
#include "log.h"
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <fcntl.h>
#include <string.h>
#include <sys/stat.h>
#include <errno.h>
#include <glib/gstdio.h>
#include "utils.h"
#include "cdc.h"
#include "../seafile-crypt.h"
#include "rabin-checksum.h"
/* Short local aliases for the Rabin fingerprint routines
 * (presumably declared in rabin-checksum.h). */
#define finger rabin_checksum
#define rolling_finger rabin_rolling_checksum

/* Defaults, in bytes, applied by init_cdc_file_descriptor() when the
 * corresponding descriptor field is unset. */
#define BLOCK_SZ (1024*1024*1)
#define BLOCK_MIN_SZ (1024*256)
#define BLOCK_MAX_SZ (1024*1024*4)

/* Length in bytes of the window the fingerprint is computed over. */
#define BLOCK_WIN_SZ 48

/* Size of the path buffer used by default_write_chunk(). */
#define NAME_MAX_SZ 4096

/* A chunk boundary is declared where the masked fingerprint equals the
 * masked BREAK_VALUE (an alternative value, 0x0513, was left commented
 * out by the original author). */
#define BREAK_VALUE 0x0013

/* Read granularity in bytes. Parenthesised so that the macro behaves as a
 * single operand in expressions such as `x / READ_SIZE`. */
#define READ_SIZE (1024 * 4)

/* Map a nibble value 0..15 to its lowercase hex digit. The argument is
 * parenthesised so that expressions such as `v & 0x0f` expand correctly;
 * it is evaluated more than once, so it must not have side effects. */
#define BYTE_TO_HEX(b) (((b) >= 10) ? ('a' + (b) - 10) : ('0' + (b)))
/*
 * Fallback chunk writer installed by init_cdc_file_descriptor() when the
 * caller supplies no write_block callback.
 *
 * Stores chunk_descr->len bytes from chunk_descr->block_buf in a file in
 * the current directory whose name is the hex form of
 * chunk_descr->checksum.
 *
 * Returns -1 if the file cannot be opened, otherwise the result of
 * writen().
 *
 * NOTE(review): this function takes one argument but is cast to
 * WriteblockFunc, and the call site in WRITE_CDC_BLOCK passes six
 * arguments -- the signatures look mismatched; confirm before relying on
 * this default.
 */
static int default_write_chunk (CDCDescriptor *chunk_descr)
{
    char hex_id[CHECKSUM_LENGTH *2 + 1];
    char path[NAME_MAX_SZ];
    int out_fd;
    int written;

    /* Zero the whole id buffer up front so it is always NUL-terminated. */
    memset (hex_id, 0, sizeof(hex_id));
    rawdata_to_hex (chunk_descr->checksum, hex_id, CHECKSUM_LENGTH);

    snprintf (path, sizeof(path), "./%s", hex_id);

    out_fd = g_open (path, O_RDWR | O_CREAT | O_BINARY, 0644);
    if (out_fd < 0)
        return -1;

    written = writen (out_fd, chunk_descr->block_buf, chunk_descr->len);
    close (out_fd);

    return written;
}
/*
 * Prepare file_descr for chunking a file of file_size bytes.
 *
 * - Resets block_nr to 0.
 * - Replaces unset (<= 0) block_min_sz / block_max_sz / block_sz with the
 *   BLOCK_MIN_SZ / BLOCK_MAX_SZ / BLOCK_SZ defaults.
 * - Installs default_write_chunk when no write_block callback is set.
 * - Allocates blk_sha1s, a zeroed array of CHECKSUM_LENGTH bytes per
 *   block, sized for the worst case where every block has the minimum
 *   size, and records that capacity in max_block_nr.
 *
 * fd is not used.
 *
 * Returns 0 on success. Returns -1 if the block count does not fit in an
 * int or the allocation fails; in that case blk_sha1s is NULL and
 * max_block_nr is 0, so no block can be recorded through the descriptor.
 */
static int init_cdc_file_descriptor (int fd,
                                     uint64_t file_size,
                                     CDCFileDescriptor *file_descr)
{
    uint64_t block_min_sz;
    uint64_t block_count;

    (void) fd;

    file_descr->block_nr = 0;

    if (file_descr->block_min_sz <= 0)
        file_descr->block_min_sz = BLOCK_MIN_SZ;
    if (file_descr->block_max_sz <= 0)
        file_descr->block_max_sz = BLOCK_MAX_SZ;
    if (file_descr->block_sz <= 0)
        file_descr->block_sz = BLOCK_SZ;

    if (file_descr->write_block == NULL)
        file_descr->write_block = (WriteblockFunc)default_write_chunk;

    /* Ceiling division done in 64 bits and without the
     * `file_size + block_min_sz - 1` form, so it cannot wrap around. */
    block_min_sz = file_descr->block_min_sz;
    block_count = file_size / block_min_sz
                  + (file_size % block_min_sz != 0 ? 1 : 0);

    /* The capacity is stored as an int; refuse counts that would be
     * truncated rather than under-allocating. */
    if (block_count > (uint64_t)INT_MAX) {
        file_descr->blk_sha1s = NULL;
        file_descr->max_block_nr = 0;
        return -1;
    }

    /* calloc(count, size) checks the multiplication for overflow itself. */
    file_descr->blk_sha1s = calloc ((size_t)block_count, CHECKSUM_LENGTH);

    /* A zero-block request may legitimately yield NULL; only a non-empty
     * request returning NULL is an allocation failure. */
    if (block_count > 0 && file_descr->blk_sha1s == NULL) {
        file_descr->max_block_nr = 0;
        return -1;
    }

    file_descr->max_block_nr = (int)block_count;

    return 0;
}
/*
 * WRITE_CDC_BLOCK(block_sz, write_data)
 *
 * Emit the first block_sz bytes of `buf` as one chunk. This is a
 * statement macro meant to be used inside file_chunk_cdc(); it reads and
 * modifies that function's locals by name: buf, tail, cur, offset, ret,
 * chunk_descr, file_ctx, crypt and file_descr.
 *
 * Steps:
 *   1. Fill chunk_descr.len / chunk_descr.offset and call
 *      file_descr->write_block.
 *   2. If the callback returns a negative value: free buf, log a warning
 *      and `return -1` from the enclosing function.
 *   3. Append chunk_descr.checksum to file_descr->blk_sha1s, feed it into
 *      the file-level SHA1 context and bump file_descr->block_nr. The
 *      caller must have checked that block_nr < max_block_nr.
 *   4. Advance offset, move the unconsumed bytes to the front of buf,
 *      shrink tail accordingly and reset cur to 0.
 *
 * The expansion deliberately has no trailing semicolon so that
 * `WRITE_CDC_BLOCK (...);` is exactly one statement, including inside an
 * unbraced if/else.
 */
#define WRITE_CDC_BLOCK(block_sz, write_data)                   \
do {                                                            \
    int _block_sz = (block_sz);                                 \
    chunk_descr.len = _block_sz;                                \
    chunk_descr.offset = offset;                                \
    ret = file_descr->write_block (file_descr->repo_id,         \
                                   file_descr->version,         \
                                   &chunk_descr,                \
                                   crypt, chunk_descr.checksum, \
                                   (write_data));               \
    if (ret < 0) {                                              \
        free (buf);                                             \
        seaf_warning ("CDC: failed to write chunk.\n");         \
        return -1;                                              \
    }                                                           \
    memcpy (file_descr->blk_sha1s +                             \
            file_descr->block_nr * CHECKSUM_LENGTH,             \
            chunk_descr.checksum, CHECKSUM_LENGTH);             \
    SHA1_Update (&file_ctx, chunk_descr.checksum,               \
                 CHECKSUM_LENGTH);                              \
    file_descr->block_nr++;                                     \
    offset += _block_sz;                                        \
                                                                \
    memmove (buf, buf + _block_sz, tail - _block_sz);           \
    tail = tail - _block_sz;                                    \
    cur = 0;                                                    \
} while (0)
/*
 * Content-defined chunking of an already-open file.
 *
 * Reads fd_src sequentially into a buffer of file_descr->block_max_sz
 * bytes and cuts it into blocks. A block ends at the first position, at
 * or after block_min_sz bytes, where the fingerprint of the preceding
 * BLOCK_WIN_SZ bytes matches BREAK_VALUE under the mask (block_sz - 1),
 * or when the block reaches block_max_sz bytes. Whatever remains at end
 * of file is emitted as a final block.
 *
 * For every block, WRITE_CDC_BLOCK invokes file_descr->write_block
 * (passing crypt and write_data through), appends the block checksum to
 * file_descr->blk_sha1s and increments file_descr->block_nr. When all
 * blocks are written, file_descr->file_sum receives the SHA1 computed
 * over the sequence of block checksums.
 *
 * fd_src      descriptor to read from; it is not closed here.
 * file_descr  chunking parameters and results. file_size is accumulated
 *             with `+=` and is not reset here -- presumably the caller
 *             zeroes the descriptor first; confirm. blk_sha1s is
 *             allocated during setup and is not freed by this function.
 * indexed     if non-NULL, incremented by the size of each block written.
 *
 * Returns 0 on success, -1 on stat/setup/allocation/read failure, if the
 * file grows beyond the size seen at stat time, if the block id array is
 * full, or if write_block fails.
 *
 * NOTE(review): the mask (block_sz - 1) only selects low bits cleanly if
 * block_sz is a power of two; confirm that callers guarantee this.
 */
int file_chunk_cdc(int fd_src,
                   CDCFileDescriptor *file_descr,
                   SeafileCrypt *crypt,
                   gboolean write_data,
                   gint64 *indexed)
{
    char *buf;
    uint32_t buf_sz;
    SHA_CTX file_ctx;
    CDCDescriptor chunk_descr;

    SHA1_Init (&file_ctx);

    SeafStat sb;
    if (seaf_fstat (fd_src, &sb) < 0) {
        seaf_warning ("CDC: failed to stat: %s.\n", strerror(errno));
        return -1;
    }
    uint64_t expected_size = sb.st_size;

    /* Propagate a setup failure instead of continuing with a descriptor
     * that may not be usable. */
    if (init_cdc_file_descriptor (fd_src, expected_size, file_descr) < 0) {
        seaf_warning ("CDC: failed to set up file descriptor.\n");
        return -1;
    }
    uint32_t block_min_sz = file_descr->block_min_sz;
    uint32_t block_mask = file_descr->block_sz - 1;

    int fingerprint = 0;
    /* Byte offset of the current block within the file. Kept in 64 bits:
     * an int would overflow once 2 GiB of data has been chunked. */
    uint64_t offset = 0;
    int ret = 0;
    int tail, cur, rsize;

    buf_sz = file_descr->block_max_sz;
    buf = chunk_descr.block_buf = malloc (buf_sz);
    if (!buf)
        return -1;

    /* buf:  a fixed-size buffer holding the not-yet-emitted data.
     * cur:  index of the next byte to scan; bytes before it have already
     *       been scanned for a break point.
     * tail: number of bytes loaded into buf. buf[tail] is invalid.
     *
     * The names buf, tail, cur, offset, ret, chunk_descr, file_ctx, crypt
     * and file_descr are used by WRITE_CDC_BLOCK and must not be renamed.
     */
    tail = cur = 0;
    while (1) {
        /* Top the buffer up to at least block_min_sz (plus one read
         * unit); after that, read one unit at a time without running
         * past the end of the buffer. */
        if (tail < block_min_sz) {
            rsize = block_min_sz - tail + READ_SIZE;
        } else {
            rsize = (buf_sz - tail < READ_SIZE) ? (buf_sz - tail) : READ_SIZE;
        }
        ret = readn (fd_src, buf + tail, rsize);
        if (ret < 0) {
            seaf_warning ("CDC: failed to read: %s.\n", strerror(errno));
            free (buf);
            return -1;
        }
        tail += ret;
        file_descr->file_size += ret;

        /* The block id array was sized from the stat()ed size, so a file
         * that grows while being read cannot be indexed safely. */
        if (file_descr->file_size > expected_size) {
            seaf_warning ("File size changed while chunking.\n");
            free (buf);
            return -1;
        }

        /* We've read all the data in this file. Output the block immediately
         * in two cases:
         * 1. The data left in the file is less than block_min_sz;
         * 2. We cannot find the break value until the end of this file.
         */
        if (tail < block_min_sz || cur >= tail) {
            if (tail > 0) {
                if (file_descr->block_nr == file_descr->max_block_nr) {
                    seaf_warning ("Block id array is not large enough, bail out.\n");
                    free (buf);
                    return -1;
                }
                /* Saved first: WRITE_CDC_BLOCK modifies tail. */
                gint64 idx_size = tail;
                WRITE_CDC_BLOCK (tail, write_data);
                if (indexed)
                    *indexed += idx_size;
            }
            break;
        }

        /*
         * A block is at least of size block_min_sz.
         */
        if (cur < block_min_sz - 1)
            cur = block_min_sz - 1;

        while (cur < tail) {
            /* The first candidate position gets a full fingerprint of
             * the window ending at cur; later positions slide the window
             * forward by one byte. */
            fingerprint = (cur == block_min_sz - 1) ?
                finger(buf + cur - BLOCK_WIN_SZ + 1, BLOCK_WIN_SZ) :
                rolling_finger (fingerprint, BLOCK_WIN_SZ,
                                *(buf+cur-BLOCK_WIN_SZ), *(buf + cur));

            /* get a chunk, write block info to chunk file */
            if (((fingerprint & block_mask) == ((BREAK_VALUE & block_mask)))
                || cur + 1 >= file_descr->block_max_sz)
            {
                if (file_descr->block_nr == file_descr->max_block_nr) {
                    seaf_warning ("Block id array is not large enough, bail out.\n");
                    free (buf);
                    return -1;
                }
                /* Saved first: WRITE_CDC_BLOCK resets cur. */
                gint64 idx_size = cur + 1;
                WRITE_CDC_BLOCK (cur + 1, write_data);
                if (indexed)
                    *indexed += idx_size;
                break;
            } else {
                cur ++;
            }
        }
    }

    SHA1_Final (file_descr->file_sum, &file_ctx);

    free (buf);

    return 0;
}
/*
 * Convenience wrapper around file_chunk_cdc() for a path.
 *
 * Opens filename read-only, chunks it with the given descriptor and
 * options, and closes the file again. Returns -1 (after logging a
 * warning) if the file cannot be opened; otherwise returns whatever
 * file_chunk_cdc() returned.
 */
int filename_chunk_cdc(const char *filename,
                       CDCFileDescriptor *file_descr,
                       SeafileCrypt *crypt,
                       gboolean write_data,
                       gint64 *indexed)
{
    int status = -1;
    int src = seaf_util_open (filename, O_RDONLY | O_BINARY);

    if (src >= 0) {
        status = file_chunk_cdc (src, file_descr, crypt, write_data, indexed);
        close (src);
    } else {
        seaf_warning ("CDC: failed to open %s.\n", filename);
    }

    return status;
}
/*
 * Module setup: initialise the Rabin checksum code for the window length
 * (BLOCK_WIN_SZ) that file_chunk_cdc() fingerprints over.
 * Presumably this must run once before any chunking is done -- confirm
 * against rabin_init().
 *
 * Declared with (void) so the definition is a real prototype and calls
 * that pass arguments are rejected at compile time.
 */
void cdc_init (void)
{
    rabin_init (BLOCK_WIN_SZ);
}