libceph: crush_location infrastructure

Allow expressing client's location in terms of CRUSH hierarchy as
a set of (bucket type name, bucket name) pairs.  The userspace syntax
"crush_location = key1=value1 key2=value2" is incompatible with mount
options and needed adaptation.  Key-value pairs are separated by '|'
and we use ':' instead of '=' to separate keys from values.  So for:

  crush_location = host=foo rack=bar

one would write:

  crush_location=host:foo|rack:bar

As in userspace, "multipath" locations are supported, so indicating
locality for parallel hierarchies is possible:

  crush_location=rack:foo1|rack:foo2|datacenter:bar

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
This commit is contained in:
Ilya Dryomov 2020-05-22 15:24:53 +02:00
parent 86403a92c3
commit 45e6aa9f55
4 changed files with 168 additions and 1 deletions

View file

@ -64,6 +64,7 @@ struct ceph_options {
int num_mon;
char *name;
struct ceph_crypto_key *key;
struct rb_root crush_locs;
};
/*

View file

@ -302,9 +302,23 @@ bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
const struct ceph_pg *raw_pgid);
/*
 * One component of a client's CRUSH location: a (bucket type name,
 * bucket name) pair, e.g. type "rack" and name "foo1".
 */
struct crush_loc {
char *cl_type_name;
char *cl_name;
};
/*
 * A crush_loc that can be linked into an rbtree.  The two strings that
 * cl_loc points at are stored inline in the trailing cl_data array, so
 * a node and its strings are a single allocation.
 */
struct crush_loc_node {
struct rb_node cl_node;
struct crush_loc cl_loc; /* pointers into cl_data */
char cl_data[];
};
/*
 * Parse "type:name|type:name|..." into @locs.  @crush_location is
 * modified in the process.  Returns 0 or a negative error code.
 */
int ceph_parse_crush_location(char *crush_location, struct rb_root *locs);
/*
 * Compare two sets of locations in tree order.  Returns 0 if they hold
 * the same pairs, non-zero otherwise.
 */
int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2);
/* Remove and free every location in @locs, leaving it empty. */
void ceph_clear_crush_locs(struct rb_root *locs);
extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
u64 id);
extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id);

View file

@ -176,6 +176,10 @@ int ceph_compare_options(struct ceph_options *new_opt,
}
}
ret = ceph_compare_crush_locs(&opt1->crush_locs, &opt2->crush_locs);
if (ret)
return ret;
/* any matching mon ip implies a match */
for (i = 0; i < opt1->num_mon; i++) {
if (ceph_monmap_contains(client->monc.monmap,
@ -260,6 +264,7 @@ enum {
Opt_secret,
Opt_key,
Opt_ip,
Opt_crush_location,
/* string args above */
Opt_share,
Opt_crc,
@ -274,6 +279,7 @@ static const struct fs_parameter_spec ceph_parameters[] = {
fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures),
fsparam_flag_no ("cephx_sign_messages", Opt_cephx_sign_messages),
fsparam_flag_no ("crc", Opt_crc),
fsparam_string ("crush_location", Opt_crush_location),
fsparam_string ("fsid", Opt_fsid),
fsparam_string ("ip", Opt_ip),
fsparam_string ("key", Opt_key),
@ -298,6 +304,7 @@ struct ceph_options *ceph_alloc_options(void)
if (!opt)
return NULL;
opt->crush_locs = RB_ROOT;
opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
GFP_KERNEL);
if (!opt->mon_addr) {
@ -320,6 +327,7 @@ void ceph_destroy_options(struct ceph_options *opt)
if (!opt)
return;
ceph_clear_crush_locs(&opt->crush_locs);
kfree(opt->name);
if (opt->key) {
ceph_crypto_key_destroy(opt->key);
@ -454,6 +462,16 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
if (!opt->key)
return -ENOMEM;
return get_secret(opt->key, param->string, &log);
case Opt_crush_location:
ceph_clear_crush_locs(&opt->crush_locs);
err = ceph_parse_crush_location(param->string,
&opt->crush_locs);
if (err) {
error_plog(&log, "Failed to parse CRUSH location: %d",
err);
return err;
}
break;
case Opt_osdtimeout:
warn_plog(&log, "Ignoring osdtimeout");
@ -536,6 +554,7 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
{
struct ceph_options *opt = client->options;
size_t pos = m->count;
struct rb_node *n;
if (opt->name) {
seq_puts(m, "name=");
@ -545,6 +564,23 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
if (opt->key)
seq_puts(m, "secret=<hidden>,");
if (!RB_EMPTY_ROOT(&opt->crush_locs)) {
seq_puts(m, "crush_location=");
for (n = rb_first(&opt->crush_locs); ; ) {
struct crush_loc_node *loc =
rb_entry(n, struct crush_loc_node, cl_node);
seq_printf(m, "%s:%s", loc->cl_loc.cl_type_name,
loc->cl_loc.cl_name);
n = rb_next(n);
if (!n)
break;
seq_putc(m, '|');
}
seq_putc(m, ',');
}
if (opt->flags & CEPH_OPT_FSID)
seq_printf(m, "fsid=%pU,", &opt->fsid);
if (opt->flags & CEPH_OPT_NOSHARE)

View file

@ -2715,3 +2715,119 @@ int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
return acting.primary;
}
EXPORT_SYMBOL(ceph_pg_to_acting_primary);
/*
 * Allocate an unlinked crush_loc_node whose trailing data area can hold
 * a string of @type_name_len bytes and a string of @name_len bytes,
 * each followed by a terminator byte.
 *
 * Returns NULL if the allocation fails.  Only the rb_node is
 * initialized; the remaining fields are left for the caller to set up.
 */
static struct crush_loc_node *alloc_crush_loc(size_t type_name_len,
					      size_t name_len)
{
	/* +2: one terminator byte per string */
	size_t data_len = type_name_len + name_len + 2;
	struct crush_loc_node *node;

	node = kmalloc(sizeof(*node) + data_len, GFP_NOIO);
	if (node)
		RB_CLEAR_NODE(&node->cl_node);

	return node;
}
/*
 * Free a node obtained from alloc_crush_loc().  Warns if the node does
 * not look unlinked, i.e. if it may still be part of a tree.
 */
static void free_crush_loc(struct crush_loc_node *loc)
{
	struct rb_node *link = &loc->cl_node;

	WARN_ON(!RB_EMPTY_NODE(link));
	kfree(loc);
}
/*
 * Order two locations by bucket type name first and by bucket name
 * second.  Returns a strcmp()-style result: zero only when both
 * strings match.
 */
static int crush_loc_compare(const struct crush_loc *loc1,
			     const struct crush_loc *loc2)
{
	int diff = strcmp(loc1->cl_type_name, loc2->cl_type_name);

	if (diff)
		return diff;

	/* same type name: the bucket name decides */
	return strcmp(loc1->cl_name, loc2->cl_name);
}
/*
 * Instantiate the rbtree helpers for crush_loc_node, keyed by a pointer
 * to the embedded cl_loc and ordered by crush_loc_compare().
 * NOTE(review): presumably this is what provides __insert_crush_loc()
 * and erase_crush_loc() used elsewhere in this file -- confirm against
 * the DEFINE_RB_FUNCS2 definition.
 */
DEFINE_RB_FUNCS2(crush_loc, struct crush_loc_node, cl_loc, crush_loc_compare,
RB_BYPTR, const struct crush_loc *, cl_node)
/*
 * Parses a set of <bucket type name>':'<bucket name> pairs separated
 * by '|', e.g. "rack:foo1|rack:foo2|datacenter:bar", and inserts one
 * node per pair into @locs.
 *
 * Note that @crush_location is modified by strsep().
 *
 * Returns 0 on success, -EINVAL if a pair has no ':' or has an empty
 * type name or an empty name, -ENOMEM if a node cannot be allocated and
 * -EEXIST if a pair cannot be inserted (presumably because an equal
 * pair is already in @locs).  Pairs inserted before a failure are left
 * in @locs.
 */
int ceph_parse_crush_location(char *crush_location, struct rb_root *locs)
{
	char *pair;

	dout("%s '%s'\n", __func__, crush_location);
	for (;;) {
		struct crush_loc_node *node;
		const char *sep, *bucket;
		size_t type_len, bucket_len;
		char *dst;

		pair = strsep(&crush_location, "|");
		if (!pair)
			break;

		/* need a ':' that is preceded by at least one character */
		sep = strchr(pair, ':');
		if (!sep || sep == pair)
			return -EINVAL;
		type_len = sep - pair;

		bucket = sep + 1;
		bucket_len = strlen(bucket);
		if (!bucket_len)
			return -EINVAL;

		node = alloc_crush_loc(type_len, bucket_len);
		if (!node)
			return -ENOMEM;

		/* type name goes first in the node's data area ... */
		dst = node->cl_data;
		node->cl_loc.cl_type_name = dst;
		memcpy(dst, pair, type_len);
		dst[type_len] = '\0';

		/* ... followed by the bucket name, right after the NUL */
		dst += type_len + 1;
		node->cl_loc.cl_name = dst;
		memcpy(dst, bucket, bucket_len);
		dst[bucket_len] = '\0';

		if (!__insert_crush_loc(locs, node)) {
			free_crush_loc(node);
			return -EEXIST;
		}

		dout("%s type_name '%s' name '%s'\n", __func__,
		     node->cl_loc.cl_type_name, node->cl_loc.cl_name);
	}

	return 0;
}
/*
 * Compare two location trees by walking both in rb_first()/rb_next()
 * order and comparing the nodes pairwise.
 *
 * Returns the first non-zero crush_loc_compare() result; otherwise -1
 * if @locs1 runs out of nodes first, 1 if @locs2 runs out first, and 0
 * if both end together.
 */
int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2)
{
	struct rb_node *a = rb_first(locs1);
	struct rb_node *b = rb_first(locs2);

	while (a && b) {
		struct crush_loc_node *lhs =
		    rb_entry(a, struct crush_loc_node, cl_node);
		struct crush_loc_node *rhs =
		    rb_entry(b, struct crush_loc_node, cl_node);
		int diff = crush_loc_compare(&lhs->cl_loc, &rhs->cl_loc);

		if (diff)
			return diff;

		a = rb_next(a);
		b = rb_next(b);
	}

	/* at least one walk is finished; the set with nodes left is "greater" */
	if (a)
		return 1;
	if (b)
		return -1;
	return 0;
}
/*
 * Empty @locs: repeatedly take the first node, erase it from the tree
 * and free it, until no nodes remain.
 */
void ceph_clear_crush_locs(struct rb_root *locs)
{
	struct rb_node *first;

	while ((first = rb_first(locs)) != NULL) {
		struct crush_loc_node *node =
		    rb_entry(first, struct crush_loc_node, cl_node);

		/* unlink before freeing */
		erase_crush_loc(locs, node);
		free_crush_loc(node);
	}
}