blob: 2dc2a1ad4723adebc1ee5b106a7bc3e0134f983e [file] [log] [blame]
/*
* vim:noexpandtab:shiftwidth=8:tabstop=8:
*
* Copyright (C) Red Hat Inc., 2014
* Author: Jiffin Tony Thottan jthottan@redhat.com
* Anand Subramanian anands@redhat.com
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* -------------
*/
#include <fcntl.h>
#include <string.h>
#include "fsal.h"
#include "fsal_types.h"
#include "fsal_api.h"
#include "fsal_up.h"
#include "gluster_internal.h"
#include "FSAL/fsal_commonlib.h"
#include "FSAL/fsal_config.h"
#include "fsal_convert.h"
#include "pnfs_utils.h"
#include "nfs_exports.h"
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netdb.h>
/* Reads a 16-bit value straight from address d by casting it to a
 * uint16_t pointer: d must be suitably aligned for uint16_t and the
 * value is returned in host byte order. */
#define get16bits(d) (*((const uint16_t *) (d)))
/* Upper bound on the number of data server entries collected from a
 * pathinfo string when selecting a DS. */
#define MAX_DS_COUNT 100
/**
 * @brief Report the layout types this export can hand out
 *
 * Only the NFSv4.1 files layout is supported, so the result is always a
 * one-element array.
 *
 * @param[in]  export_pub Public export handle (not consulted)
 * @param[out] count      Receives the number of entries in the array (1)
 * @param[out] types      Receives a pointer to a static array of layout
 *                        types; it must not be freed or modified, and
 *                        must not be dereferenced after the export
 *                        reference is relinquished
 */
static void fs_layouttypes(struct fsal_export *export_pub, int32_t *count,
			   const layouttype4 **types)
{
	/* The one and only layout type we know how to serve */
	static const layouttype4 file_layout_only[] = {
		LAYOUT4_NFSV4_1_FILES
	};

	*count = 1;
	*types = file_layout_only;
}
/**
 * @brief Report the layout block size for an export
 *
 * The value is a fixed constant; the export is not consulted.
 *
 * @param[in] export_pub Public export handle (not consulted)
 *
 * @return 4 MB (0x400000 bytes).
 */
static uint32_t fs_layout_blocksize(struct fsal_export *export_pub)
{
	/* 4 << 20 == 0x400000 */
	const uint32_t block_size = UINT32_C(4) << 20;

	return block_size;
}
/**
 * @brief Report the maximum number of layout segments we will use
 *
 * Current clients only support a single segment, so that is the limit.
 *
 * @param[in] export_pub Public export handle (not consulted)
 *
 * @return 1
 */
static uint32_t fs_maximum_segments(struct fsal_export *export_pub)
{
	const uint32_t max_segments = 1;

	return max_segments;
}
/**
 * @brief Report the buffer size needed to hold a loc_body
 *
 * A fixed size is returned: room for a handle plus a bit.
 *
 * @param[in] export_pub Public export handle (not consulted)
 *
 * @return Size in bytes of the buffer needed for a loc_body (0x100)
 */
static size_t fs_loc_body_size(struct fsal_export *export_pub)
{
	const size_t loc_body_bytes = 256;

	return loc_body_bytes;
}
/**
 * @brief Report the buffer size needed to hold a ds_addr
 *
 * A fixed upper bound is returned; the module handle is not consulted.
 *
 * @param[in] fsal_hdl FSAL module handle (not consulted)
 *
 * @return Size in bytes of the buffer needed for a ds_addr (0x1400)
 */
size_t fs_da_addr_size(struct fsal_module *fsal_hdl)
{
	/* 5 KiB == 0x1400 */
	const size_t da_addr_bytes = 5 * 1024;

	return da_addr_bytes;
}
/* Forward declaration; the function is defined later in this file.
 * It stores the selected data server's IPv4 address in *ds_addr and
 * returns zero on success. */
int glfs_get_ds_addr(struct glfs *fs, struct glfs_object *object,
uint32_t *ds_addr);
/**
 * @brief Grant a layout segment.
 *
 * Grants a layout covering the whole file. Exactly one DS wire handle
 * is encoded; it carries the file's gfid and the file layout
 * parameters.
 *
 * @param[in]     obj_pub  Public object handle
 * @param[in]     req_ctx  Request context
 * @param[out]    loc_body An XDR stream to which the FSAL must encode
 *                         the layout specific portion of the granted
 *                         layout segment.
 * @param[in]     arg      Input arguments of the function
 * @param[in,out] res      In/out and output arguments of the function
 *
 * @return Valid error codes in RFC 5661, pp. 366-7.
 */
static nfsstat4 pnfs_layout_get(struct fsal_obj_handle *obj_pub,
				struct req_op_context *req_ctx,
				XDR *loc_body,
				const struct fsal_layoutget_arg *arg,
				struct fsal_layoutget_res *res)
{
	struct glusterfs_export *export =
		container_of(req_ctx->fsal_export,
			     struct glusterfs_export, export);
	struct glusterfs_handle *handle =
		container_of(obj_pub, struct glusterfs_handle, handle);
	int rc = 0;
	/* Structure containing the storage parameters of the file within
	   glusterfs. */
	struct glfs_file_layout file_layout;
	/* Utility parameter */
	nfl_util4 util = 0;
	/* Stores Data server address */
	struct pnfs_deviceid deviceid = DEVICE_ID_INIT_ZERO(FSAL_ID_GLUSTER);
	nfsstat4 nfs_status = NFS4_OK;
	/* Descriptor for DS handle */
	struct gsh_buffdesc ds_desc;
	/* DS wire handle sent to the client */
	struct glfs_ds_wire ds_wire;

	/* Supports only LAYOUT4_NFSV4_1_FILES layouts */
	if (arg->type != LAYOUT4_NFSV4_1_FILES) {
		LogMajor(COMPONENT_PNFS, "Unsupported layout type: %x",
			 arg->type);
		return NFS4ERR_UNKNOWN_LAYOUTTYPE;
	}

	memset(&file_layout, 0, sizeof(struct glfs_file_layout));
	/* The wire handle is handed to the encoder as sizeof(ds_wire) raw
	 * bytes, so clear it first: otherwise padding or any bytes the
	 * code below does not fill would carry stack contents to the
	 * client.
	 */
	memset(&ds_wire, 0, sizeof(ds_wire));

	/**
	 * Currently whole file is given as file layout,
	 *
	 * Stripe type is dense which is supported right now.
	 * Stripe length is max possible length of file that
	 * can be accessed by the client to perform a read or
	 * write.
	 */
	file_layout.stripe_type = NFL4_UFLG_DENSE;
	file_layout.stripe_length = 0x100000;
	util |= file_layout.stripe_type | file_layout.stripe_length;

	/* The device id carries the address of the selected data server */
	rc = glfs_get_ds_addr(export->gl_fs, handle->glhandle,
			      &deviceid.device_id4);
	if (rc) {
		LogMajor(COMPONENT_PNFS, "Invalid hostname for DS");
		return NFS4ERR_INVAL;
	}

	/** @todo: When more than one client tries access the same layout
	 *         for the write operation, then last write will overwrite
	 *         the previous ones, the MDS should intelligently deal
	 *         those scenarios
	 */

	/* We return exactly one wirehandle, filling in the necessary
	 * information for the DS server to speak to the gluster bricks
	 * For this, wire handle stores gfid and file layout
	 */
	rc = glfs_h_extract_handle(handle->glhandle, ds_wire.gfid,
				   GFAPI_HANDLE_LENGTH);
	if (rc < 0) {
		LogMajor(COMPONENT_PNFS, "Invalid glfs_object");
		/* NOTE(review): this treats -rc as an errno value; confirm
		 * glfs_h_extract_handle reports failures that way rather
		 * than returning -1 with errno set.
		 */
		return posix2nfs4_error(-rc);
	}

	ds_wire.layout = file_layout;
	ds_desc.addr = &ds_wire;
	ds_desc.len = sizeof(struct glfs_ds_wire);

	nfs_status = FSAL_encode_file_layout(loc_body, &deviceid, util, 0, 0,
					     &req_ctx->ctx_export->export_id,
					     1, &ds_desc);
	if (nfs_status) {
		LogMajor(COMPONENT_PNFS,
			 "Failed to encode nfsv4_1_file_layout.");
		return nfs_status;
	}

	/* We grant only one segment, and we want it back
	 * when the file is closed.
	 */
	res->return_on_close = true;
	res->last_segment = true;

	return nfs_status;
}
/**
 * @brief Potentially return one layout segment
 *
 * No reservations are made and no pins are held in this version, so
 * once the layout type has been validated there is nothing to release
 * and the call succeeds.
 *
 * @param[in] obj_pub  Public object handle
 * @param[in] req_ctx  Request context
 * @param[in] lrf_body Nothing for us
 * @param[in] arg      Input arguments of the function
 *
 * @return Valid error codes in RFC 5661, p. 367.
 */
static nfsstat4 pnfs_layout_return(struct fsal_obj_handle *obj_pub,
				   struct req_op_context *req_ctx,
				   XDR *lrf_body,
				   const struct fsal_layoutreturn_arg *arg)
{
	if (arg->lo_type == LAYOUT4_NFSV4_1_FILES)
		return NFS4_OK;

	LogDebug(COMPONENT_PNFS, "Unsupported layout type: %x",
		 arg->lo_type);
	return NFS4ERR_UNKNOWN_LAYOUTTYPE;
}
/**
 * @brief Commit a segment of a layout
 *
 * Brings the file's size and modification time up to date after it has
 * been written through a layout. The file is only ever grown, and the
 * client-supplied time is only used when it is newer than the current
 * mtime; otherwise the current wall-clock time is applied.
 *
 * @param[in]     obj_pub  Public object handle
 * @param[in]     req_ctx  Request context
 * @param[in]     lou_body An XDR stream containing the layout
 *                         type-specific portion of the LAYOUTCOMMIT
 *                         arguments.
 * @param[in]     arg      Input arguments of the function
 * @param[in,out] res      In/out and output arguments of the function
 *
 * @return Valid error codes in RFC 5661, p. 366.
 */
static nfsstat4 pnfs_layout_commit(struct fsal_obj_handle *obj_pub,
				   struct req_op_context *req_ctx,
				   XDR *lou_body,
				   const struct fsal_layoutcommit_arg *arg,
				   struct fsal_layoutcommit_res *res)
{
	struct glusterfs_export *glfs_export =
		container_of(op_ctx->fsal_export,
			     struct glusterfs_export, export);
	struct glusterfs_handle *objhandle =
		container_of(obj_pub, struct glusterfs_handle, handle);
	/* Attributes currently on the MDS, so that we never shrink the
	 * file or move its time backwards */
	struct stat cur_attrs;
	/* Attributes to apply */
	struct stat upd_attrs;
	/* Only the modification time goes through setattrs */
	int attr_mask = GLAPI_SET_ATTR_MTIME;
	int rc = 0;

	if (arg->type != LAYOUT4_NFSV4_1_FILES) {
		LogMajor(COMPONENT_PNFS, "Unsupported layout type: %x",
			 arg->type);
		return NFS4ERR_UNKNOWN_LAYOUTTYPE;
	}

	/* Fetch the file's present state */
	rc = glfs_h_stat(glfs_export->gl_fs, objhandle->glhandle,
			 &cur_attrs);
	if (rc != 0) {
		LogMajor(COMPONENT_PNFS,
			 "Commit layout, stat unsucessfully completed");
		return NFS4ERR_INVAL;
	}

	memset(&upd_attrs, 0, sizeof(upd_attrs));

	/* Extend the file when the client wrote past the current end */
	if (arg->new_offset && cur_attrs.st_size < arg->last_write + 1) {
		upd_attrs.st_size = arg->last_write + 1;
		res->size_supplied = true;
		res->new_size = arg->last_write + 1;

		rc = glfs_h_truncate(glfs_export->gl_fs,
				     objhandle->glhandle,
				     res->new_size);
		if (rc != 0) {
			LogMajor(COMPONENT_PNFS,
				 "Commit layout, size changed unsucessfully completed");
			return NFS4ERR_INVAL;
		}
	}

	/* Fall back to "now" unless the client gave a strictly newer time */
	if (!arg->time_changed ||
	    arg->new_time.seconds <= cur_attrs.st_mtime)
		upd_attrs.st_mtime = time(NULL);
	else
		upd_attrs.st_mtime = arg->new_time.seconds;

	rc = glfs_h_setattrs(glfs_export->gl_fs, objhandle->glhandle,
			     &upd_attrs, attr_mask);
	if (rc != 0) {
		LogMajor(COMPONENT_PNFS,
			 "commit layout, setattr unsucessflly completed");
		return NFS4ERR_INVAL;
	}

	res->commit_done = true;
	return NFS4_OK;
}
/**
 * @brief Describes the DS information for the client
 *
 * Encodes a device address made of a single stripe index (0) and a
 * single multipath entry whose IPv4 address is taken from the device
 * id.
 *
 * @param[in]  fsal_hdl     FSAL module handle (not consulted)
 * @param[out] da_addr_body Stream we write the result to
 * @param[in]  type         Type of layout that gave the device
 * @param[in]  deviceid     The device to look up
 *
 * @return Valid error codes in RFC 5661, p. 365.
 */
nfsstat4 getdeviceinfo(struct fsal_module *fsal_hdl,
		       XDR *da_addr_body, const layouttype4 type,
		       const struct pnfs_deviceid *deviceid)
{
	/* Entire file layout will be situated inside ONE DS
	 * And whole file is provided to the DS, so the starting
	 * index for that file is zero
	 */
	uint32_t stripes = 1;
	uint32_t stripe_ind = 0;
	unsigned num_ds = 1;
	/* Stores IP address of DS */
	fsal_multipath_member_t host;
	nfsstat4 nfs_status = 0;

	if (type != LAYOUT4_NFSV4_1_FILES) {
		LogMajor(COMPONENT_PNFS, "Unsupported layout type: %x", type);
		return NFS4ERR_UNKNOWN_LAYOUTTYPE;
	}

	/* Length of the stripe_indices array */
	if (!inline_xdr_u_int32_t(da_addr_body, &stripes)) {
		LogMajor(COMPONENT_PNFS,
			 "Failed to encode length of stripe_indices array: %"
			 PRIu32 ".", stripes);
		return NFS4ERR_SERVERFAULT;
	}

	/* The single stripe index */
	if (!inline_xdr_u_int32_t(da_addr_body, &stripe_ind)) {
		LogMajor(COMPONENT_PNFS,
			 "Failed to encode ds for the stripe: %"
			 PRIu32 ".", stripe_ind);
		return NFS4ERR_SERVERFAULT;
	}

	/* Length of the multipath_ds_list array */
	if (!inline_xdr_u_int32_t(da_addr_body, &num_ds)) {
		LogMajor(COMPONENT_PNFS,
			 "Failed to encode length of multipath_ds_list array: %u",
			 num_ds);
		return NFS4ERR_SERVERFAULT;
	}

	memset(&host, 0, sizeof(host));
	host.proto = 6;		/* 6 is the IANA protocol number of TCP */
	host.port = 2049;	/* 2049 is the registered NFS port */
	host.addr = ntohl(deviceid->device_id4);

	nfs_status = FSAL_encode_v4_multipath(da_addr_body, 1, &host);
	if (nfs_status != NFS4_OK)
		LogMajor(COMPONENT_PNFS,
			 "Failed to encode data server address");

	/** @todo: Only the Data-Server where the file resides is sent
	 *         from the MDS. If that Data-Server is down then
	 *         read or write will be performed through the MDS.
	 *         Should we instead send the information about all
	 *         the available data-servers, so that these fops are
	 *         always performed through Data-Servers?
	 *         (Like in replicated volume contains more than ONE DS)
	 */
	return nfs_status;
}
/**
 * @brief Get list of available devices
 *
 * Listing devices is not supported: the callback is never invoked and
 * the result is simply flagged as end-of-list.
 *
 * @param[in]     export_pub Export handle (not consulted)
 * @param[in]     type       Type of layout to get devices for
 *                           (not consulted)
 * @param[in]     opaque     Caller context for the callback (unused)
 * @param[in]     cb         Per-device callback (never called)
 * @param[in,out] res        In/out and output arguments of the function
 *
 * @return Valid error codes in RFC 5661, pp. 365-6.
 */
static nfsstat4 getdevicelist(struct fsal_export *export_pub, layouttype4 type,
			      void *opaque,
			      bool (*cb)(void *opaque, const uint64_t id),
			      struct fsal_getdevicelist_res *res)
{
	const nfsstat4 status = NFS4_OK;

	/* Nothing to enumerate */
	res->eof = true;

	return status;
}
/**
 * @brief Install the pNFS layout operations into an object ops vector
 *
 * @param[in,out] ops Object operations vector to fill in
 */
void handle_ops_pnfs(struct fsal_obj_ops *ops)
{
	/* LAYOUTGET / LAYOUTCOMMIT / LAYOUTRETURN handlers */
	ops->layoutget    = pnfs_layout_get;
	ops->layoutcommit = pnfs_layout_commit;
	ops->layoutreturn = pnfs_layout_return;
}
/**
 * @brief Install the pNFS device operations into a FSAL ops vector
 *
 * @param[in,out] ops FSAL module operations vector to fill in
 */
void fsal_ops_pnfs(struct fsal_ops *ops)
{
	ops->fs_da_addr_size = fs_da_addr_size;
	ops->getdeviceinfo   = getdeviceinfo;
}
/**
 * @brief Install the pNFS export operations into an export ops vector
 *
 * @param[in,out] ops Export operations vector to fill in
 */
void export_ops_pnfs(struct export_ops *ops)
{
	/* Layout parameter queries */
	ops->fs_layouttypes      = fs_layouttypes;
	ops->fs_layout_blocksize = fs_layout_blocksize;
	ops->fs_maximum_segments = fs_maximum_segments;
	ops->fs_loc_body_size    = fs_loc_body_size;

	/* Device enumeration */
	ops->getdevicelist       = getdevicelist;
}
/*
 * Reads two consecutive bytes at p as a uint16_t in host byte order.
 *
 * memcpy is used instead of dereferencing a casted pointer so that the
 * read is well defined whatever the alignment of p.
 */
static inline uint16_t sfh_read16(const unsigned char *p)
{
	uint16_t value;

	memcpy(&value, p, sizeof(value));
	return value;
}

/**
 * @brief Calculates a hash value for a given byte buffer
 *
 * The buffer is consumed four bytes at a time as two 16-bit words read
 * in host byte order; the one to three trailing bytes are mixed in
 * separately before the final avalanche step.
 *
 * @param[in] data Buffer to hash; may have any alignment
 * @param[in] len  Number of bytes in data
 *
 * @return The 32-bit hash; 0 when data is NULL.
 */
uint32_t superfasthash(const unsigned char *data, uint32_t len)
{
	uint32_t hash = len, tmp;
	int32_t rem;

	if (data == NULL)
		return 0;

	rem = len & 3;
	len >>= 2;

	/* Main loop */
	for (; len > 0; len--) {
		hash += sfh_read16(data);
		tmp = ((uint32_t)sfh_read16(data + 2) << 11) ^ hash;
		hash = (hash << 16) ^ tmp;
		data += 2 * sizeof(uint16_t);
		hash += hash >> 11;
	}

	/* Handle end cases */
	switch (rem) {
	case 3:
		hash += sfh_read16(data);
		hash ^= hash << 16;
		hash ^= (uint32_t)data[sizeof(uint16_t)] << 18;
		hash += hash >> 11;
		break;
	case 2:
		hash += sfh_read16(data);
		hash ^= hash << 11;
		hash += hash >> 17;
		break;
	case 1:
		hash += *data;
		hash ^= hash << 10;
		hash += hash >> 1;
		break;
	default:
		/* Length was a multiple of four: nothing left over */
		break;
	}

	/* Force "avalanching" of final 127 bits */
	hash ^= hash << 3;
	hash += hash >> 5;
	hash ^= hash << 4;
	hash += hash >> 17;
	hash ^= hash << 25;
	hash += hash >> 6;

	return hash;
}
/**
 * @brief Pick a data server from a pathinfo string and extract its
 *        hostname
 *
 * Every occurrence of "POSIX" in pathinfo is taken as the start of one
 * server entry (at most MAX_DS_COUNT are considered). When there is
 * more than one entry, the object's handle is hashed to choose among
 * them. The hostname is the text between the first and the second ':'
 * that follow the chosen entry's "POSIX" marker.
 *
 * @param[in]  object   Object whose handle is used as the hash key
 * @param[in]  pathinfo NUL-terminated pathinfo string
 * @param[out] hostname Buffer receiving the NUL-terminated hostname
 * @param[in]  size     Size of the hostname buffer in bytes
 *
 * @return Zero and a valid hostname on success; non-zero when the
 *         arguments are invalid, no server entry is found, the handle
 *         cannot be extracted, or the hostname is missing, empty or
 *         does not fit in the buffer.
 */
int
select_ds(struct glfs_object *object, char *pathinfo, char *hostname,
	  size_t size)
{
	/* Represents starting of each server in the list */
	const char posix[10] = "POSIX";
	/* Array of pathinfo of available dses */
	char *ds_path_info[MAX_DS_COUNT];
	/* Key for hashing */
	unsigned char key[16];
	/* Cursor used while scanning pathinfo for server entries */
	char *tmp = NULL;
	/* Stores starting of hostname */
	char *start = NULL;
	/* Stores ending of hostname */
	char *end = NULL;
	/* Length of the hostname, excluding the terminator */
	size_t host_len = 0;
	int ret = -1;
	/* Index of the chosen entry in ds_path_info */
	int ds_index = 0;
	/* counts no of available ds */
	int no_of_ds = 0;

	if (!pathinfo || !hostname || !size)
		goto out;

	tmp = pathinfo;
	while ((tmp = strstr(tmp, posix))) {
		ds_path_info[no_of_ds] = tmp;
		tmp++;
		no_of_ds++;
		/* *
		 * If no of dses reaches maximum count, then
		 * perform load balance on current list
		 */
		if (no_of_ds == MAX_DS_COUNT)
			break;
	}

	if (no_of_ds == 0) {
		LogCrit(COMPONENT_PNFS,
			"Invalid pathinfo(%s) attribute found while selecting DS.",
			pathinfo);
		goto out;
	}

	ret = glfs_h_extract_handle(object, key, GFAPI_HANDLE_LENGTH);
	if (ret < 0)
		goto out;

	/* From here on every early exit is a failure; keep the chosen
	 * index in its own variable so it can never be mistaken for a
	 * return code.
	 */
	ret = -1;

	/* Pick DS from the list */
	if (no_of_ds > 1)
		ds_index = superfasthash(key, 16) % no_of_ds;

	start = strchr(ds_path_info[ds_index], ':');
	if (!start)
		goto out;
	start++;

	/* Without a closing ':' there is no delimited hostname */
	end = strchr(start, ':');
	if (!end || end == start)
		goto out;

	/* The hostname and its terminator must fit in the caller's buffer */
	host_len = (size_t)(end - start);
	if (host_len >= size)
		goto out;

	memset(hostname, 0, size);
	memcpy(hostname, start, host_len);
	ret = 0;

	LogDebug(COMPONENT_PNFS, "hostname %s", hostname);
out:
	return ret;
}
/**
 * @brief Resolve the address of the data server for an object
 *
 * Reads the object's "trusted.glusterfs.pathinfo" extended attribute,
 * picks a server from it with select_ds(), and resolves that server's
 * hostname to an IPv4 address.
 * (Striped volumes are not considered right now)
 *
 * @param[in]  fs      Gluster filesystem handle
 * @param[in]  object  Object to locate
 * @param[out] ds_addr Receives the IPv4 address of the server exactly
 *                     as stored in sin_addr.s_addr; set to 0 on
 *                     failure
 *
 * @return Zero on success, non-zero otherwise.
 */
int
glfs_get_ds_addr(struct glfs *fs, struct glfs_object *object, uint32_t *ds_addr)
{
	int ret = 0;
	char pathinfo[1024] = {0, };
	char hostname[1024] = {0, };
	struct addrinfo hints, *res = NULL;
	struct in_addr addr = {0, };
	const char *pathinfokey = "trusted.glusterfs.pathinfo";

	/* Hand over one byte less than the buffer so that the value is
	 * always NUL terminated before it is searched and logged as a
	 * string.
	 */
	ret = glfs_h_getxattrs(fs, object, pathinfokey, pathinfo,
			       sizeof(pathinfo) - 1);
	if (ret < 0) {
		LogMajor(COMPONENT_PNFS, "Failed to read %s", pathinfokey);
		goto out;
	}

	LogDebug(COMPONENT_PNFS, "pathinfo %s", pathinfo);

	ret = select_ds(object, pathinfo, hostname, sizeof(hostname));
	if (ret) {
		LogMajor(COMPONENT_PNFS, "No DS found");
		goto out;
	}

	/* Only IPv4 results are usable: the address must fit a uint32_t */
	memset(&hints, 0, sizeof(hints));
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_family = AF_INET;

	ret = getaddrinfo(hostname, NULL, &hints, &res);
	if (ret != 0) {
		LogMajor(COMPONENT_PNFS, "error %d\n", ret);
		goto out;
	}

	/* Use the first address returned */
	addr.s_addr = ((struct sockaddr_in *)(res->ai_addr))->sin_addr.s_addr;
	LogDebug(COMPONENT_PNFS, "ip address : %s", inet_ntoa(addr));

	freeaddrinfo(res);
out:
	/* addr is still zero when any step above failed */
	*ds_addr = addr.s_addr;
	return ret;
}