From 422d2cb8f9afadba1ecd3614f658b6daaaa480fb Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 26 Feb 2010 15:32:31 -0800 Subject: ceph: reset osd after relevant messages timed out This simplifies the process of timing out messages. We keep lru of current messages that are in flight. If a timeout has passed, we reset the osd connection, so that messages will be retransmitted. This is a failsafe in case we hit some sort of problem sending out message to the OSD. Normally, we'll get notification via an updated osdmap if there are problems. If a request is older than the keepalive timeout, send a keepalive to ensure we detect any breaks in the TCP connection. Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/osd_client.c | 153 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 98 insertions(+), 55 deletions(-) (limited to 'fs/ceph/osd_client.c') diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index c4763bff97b4..dbe63db9762f 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -17,6 +17,8 @@ #define OSD_OPREPLY_FRONT_LEN 512 const static struct ceph_connection_operations osd_con_ops; +static int __kick_requests(struct ceph_osd_client *osdc, + struct ceph_osd *kickosd); static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); @@ -339,6 +341,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) osd->o_con.ops = &osd_con_ops; osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD; + INIT_LIST_HEAD(&osd->o_keepalive_item); return osd; } @@ -461,6 +464,16 @@ static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) return NULL; } +static void __schedule_osd_timeout(struct ceph_osd_client *osdc) +{ + schedule_delayed_work(&osdc->timeout_work, + osdc->client->mount_args->osd_keepalive_timeout * HZ); +} + +static void __cancel_osd_timeout(struct ceph_osd_client *osdc) +{ + cancel_delayed_work(&osdc->timeout_work); +} /* * Register request, assign tid. If this is the first request, set up @@ -472,21 +485,16 @@ static void register_request(struct ceph_osd_client *osdc, mutex_lock(&osdc->request_mutex); req->r_tid = ++osdc->last_tid; req->r_request->hdr.tid = cpu_to_le64(req->r_tid); + INIT_LIST_HEAD(&req->r_req_lru_item); dout("register_request %p tid %lld\n", req, req->r_tid); __insert_request(osdc, req); ceph_osdc_get_request(req); osdc->num_requests++; - req->r_timeout_stamp = - jiffies + osdc->client->mount_args->osd_timeout*HZ; - if (osdc->num_requests == 1) { - osdc->timeout_tid = req->r_tid; - dout(" timeout on tid %llu at %lu\n", req->r_tid, - req->r_timeout_stamp); - schedule_delayed_work(&osdc->timeout_work, - round_jiffies_relative(req->r_timeout_stamp - jiffies)); + dout(" first request, scheduling timeout\n"); + __schedule_osd_timeout(osdc); } mutex_unlock(&osdc->request_mutex); } @@ -513,21 +521,10 @@ static void __unregister_request(struct ceph_osd_client *osdc, ceph_osdc_put_request(req); - if (req->r_tid == osdc->timeout_tid) { - if (osdc->num_requests == 0) { - dout("no requests, canceling timeout\n"); - osdc->timeout_tid = 0; - cancel_delayed_work(&osdc->timeout_work); - } else { - req = rb_entry(rb_first(&osdc->requests), - struct ceph_osd_request, r_node); - osdc->timeout_tid = req->r_tid; - dout("rescheduled timeout on tid %llu at %lu\n", - req->r_tid, req->r_timeout_stamp); - schedule_delayed_work(&osdc->timeout_work, - round_jiffies_relative(req->r_timeout_stamp - - jiffies)); - } + list_del_init(&req->r_req_lru_item); + if (osdc->num_requests == 0) { + dout(" no requests, canceling timeout\n"); + __cancel_osd_timeout(osdc); } } @@ -540,6 +537,7 @@ static void __cancel_request(struct ceph_osd_request *req) ceph_con_revoke(&req->r_osd->o_con, req->r_request); req->r_sent = 0; } + list_del_init(&req->r_req_lru_item); } /* @@ -635,7 +633,8 @@ static int __send_request(struct ceph_osd_client *osdc, reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */ reqhead->reassert_version = req->r_reassert_version; - req->r_timeout_stamp = jiffies+osdc->client->mount_args->osd_timeout*HZ; + req->r_sent_stamp = jiffies; + list_move_tail(&osdc->req_lru, &req->r_req_lru_item); ceph_msg_get(req->r_request); /* send consumes a ref */ ceph_con_send(&req->r_osd->o_con, req->r_request); @@ -656,11 +655,14 @@ static void handle_timeout(struct work_struct *work) { struct ceph_osd_client *osdc = container_of(work, struct ceph_osd_client, timeout_work.work); - struct ceph_osd_request *req; + struct ceph_osd_request *req, *last_req = NULL; struct ceph_osd *osd; unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ; - unsigned long next_timeout = timeout + jiffies; + unsigned long keepalive = + osdc->client->mount_args->osd_keepalive_timeout * HZ; + unsigned long last_sent = 0; struct rb_node *p; + struct list_head slow_osds; dout("timeout\n"); down_read(&osdc->map_sem); @@ -683,25 +685,56 @@ static void handle_timeout(struct work_struct *work) continue; } } - for (p = rb_first(&osdc->osds); p; p = rb_next(p)) { - osd = rb_entry(p, struct ceph_osd, o_node); - if (list_empty(&osd->o_requests)) - continue; - req = list_first_entry(&osd->o_requests, - struct ceph_osd_request, r_osd_item); - if (time_before(jiffies, req->r_timeout_stamp)) - continue; - dout(" tid %llu (at least) timed out on osd%d\n", + /* + * reset osds that appear to be _really_ unresponsive. this + * is a failsafe measure.. we really shouldn't be getting to + * this point if the system is working properly. the monitors + * should mark the osd as failed and we should find out about + * it from an updated osd map. + */ + while (!list_empty(&osdc->req_lru)) { + req = list_entry(osdc->req_lru.next, struct ceph_osd_request, + r_req_lru_item); + + if (time_before(jiffies, req->r_sent_stamp + timeout)) + break; + + BUG_ON(req == last_req && req->r_sent_stamp == last_sent); + last_req = req; + last_sent = req->r_sent_stamp; + + osd = req->r_osd; + BUG_ON(!osd); + pr_warning(" tid %llu timed out on osd%d, will reset osd\n", + req->r_tid, osd->o_osd); + __kick_requests(osdc, osd); + } + + /* + * ping osds that are a bit slow. this ensures that if there + * is a break in the TCP connection we will notice, and reopen + * a connection with that osd (from the fault callback). + */ + INIT_LIST_HEAD(&slow_osds); + list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { + if (time_before(jiffies, req->r_sent_stamp + keepalive)) + break; + + osd = req->r_osd; + BUG_ON(!osd); + dout(" tid %llu is slow, will send keepalive on osd%d\n", req->r_tid, osd->o_osd); - req->r_timeout_stamp = next_timeout; + list_move_tail(&osd->o_keepalive_item, &slow_osds); + } + while (!list_empty(&slow_osds)) { + osd = list_entry(slow_osds.next, struct ceph_osd, + o_keepalive_item); + list_del_init(&osd->o_keepalive_item); ceph_con_keepalive(&osd->o_con); } - if (osdc->timeout_tid) - schedule_delayed_work(&osdc->timeout_work, - round_jiffies_relative(timeout)); - + __schedule_osd_timeout(osdc); mutex_unlock(&osdc->request_mutex); up_read(&osdc->map_sem); @@ -819,18 +852,7 @@ bad: } -/* - * Resubmit osd requests whose osd or osd address has changed. Request - * a new osd map if osds are down, or we are otherwise unable to determine - * how to direct a request. - * - * Close connections to down osds. - * - * If @who is specified, resubmit requests for that specific osd. - * - * Caller should hold map_sem for read and request_mutex. - */ -static void kick_requests(struct ceph_osd_client *osdc, +static int __kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *kickosd) { struct ceph_osd_request *req; @@ -839,7 +861,6 @@ static void kick_requests(struct ceph_osd_client *osdc, int err; dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1); - mutex_lock(&osdc->request_mutex); if (kickosd) { __reset_osd(osdc, kickosd); } else { @@ -900,14 +921,36 @@ kick: req->r_resend = true; } } + + return needmap; +} + +/* + * Resubmit osd requests whose osd or osd address has changed. Request + * a new osd map if osds are down, or we are otherwise unable to determine + * how to direct a request. + * + * Close connections to down osds. + * + * If @who is specified, resubmit requests for that specific osd. + * + * Caller should hold map_sem for read and request_mutex. + */ +static void kick_requests(struct ceph_osd_client *osdc, + struct ceph_osd *kickosd) +{ + int needmap; + + mutex_lock(&osdc->request_mutex); + needmap = __kick_requests(osdc, kickosd); mutex_unlock(&osdc->request_mutex); if (needmap) { dout("%d requests for down osds, need new map\n", needmap); ceph_monc_request_next_osdmap(&osdc->client->monc); } -} +} /* * Process updated osd map. * @@ -1164,11 +1207,11 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) init_completion(&osdc->map_waiters); osdc->last_requested_map = 0; mutex_init(&osdc->request_mutex); - osdc->timeout_tid = 0; osdc->last_tid = 0; osdc->osds = RB_ROOT; INIT_LIST_HEAD(&osdc->osd_lru); osdc->requests = RB_ROOT; + INIT_LIST_HEAD(&osdc->req_lru); osdc->num_requests = 0; INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); -- cgit v1.2.3