aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Oliver Smith <osmith@sysmocom.de> 2022-11-21 10:25:26 +0100
committer Keith Whyte <keith@rhizomatica.org> 2024-04-04 23:42:01 +0100
commit 407cebf8f5a3553438fc8996804950569dc19e42 (patch)
tree 37dc155aeb66e591ffb8071ce39e72254f0313f7
parent ac1085c06a33e38ff7c5cebedc22a13e41ab3acd (diff)
recover BORKEN lchans for missing ACK scenarios [rhizomatica/testing]
We already recover broken lchans where an ACTIV ACK or REL ACK arrives late. Now add a recovery path for lchans that are broken because no ACTIV ACK or REL ACK arrives at all.

Add a timeout of X28 = 30s to the lchan BORKEN state. On timeout, attempt both a Channel Activation and a Channel Release. If either of them is ACKed, we have successfully synced the BTS's and the BSC's state. After successful recovery, place the lchan back in the UNUSED state, available for servicing subscribers. If recovery is unsuccessful, just continue to attempt recovery every further X28 seconds.

Patch-by: osmith, nhofmeyr
Related: osmo-ttcn3-hacks I9b4ddfc4a337808d9d5ec538c25fd390b1b2530f
Related: OS#5106
Related: SYS#6655
Change-Id: Ic4728b3efe843ea63e2a0b54b1ea8a925347484a
-rw-r--r-- doc/lchan-fsm.dot 10
-rw-r--r-- include/osmocom/bsc/lchan_fsm.h 2
-rw-r--r-- src/osmo-bsc/lchan_fsm.c 100
-rw-r--r-- src/osmo-bsc/net_init.c 1
-rw-r--r-- tests/timer.vty 2
5 files changed, 115 insertions, 0 deletions
diff --git a/doc/lchan-fsm.dot b/doc/lchan-fsm.dot
index fe35903f7..82ef922f3 100644
--- a/doc/lchan-fsm.dot
+++ b/doc/lchan-fsm.dot
@@ -32,6 +32,7 @@ labelloc=t; label="lchan FSM"
WAIT_TS_READY -> UNUSED [label="error/timeout",style=dashed,constraint=false]
{WAIT_ACTIV_ACK,WAIT_RF_RELEASE_ACK} -> BORKEN [label="error/timeout",style=dashed]
BORKEN -> WAIT_AFTER_ERROR [label="late RF Release ACK"]
+ BORKEN -> WAIT_RF_RELEASE_ACK [label="late Activation ACK"]
WAIT_RLL_RTP_ESTABLISH -> WAIT_RLL_RTP_RELEASED [label=error,style=dashed]
WAIT_ACTIV_ACK -> rtp [label="LCHAN_RTP_EV_LCHAN_READY",style=dotted]
@@ -44,4 +45,13 @@ labelloc=t; label="lchan FSM"
WAIT_RSL_CHAN_MODE_MODIFY_ACK -> ESTABLISHED [label="LCHAN_EV_RSL_CHAN_MODE_MODIFY_ACK\nno change to RTP"]
WAIT_RR_CHAN_MODE_MODIFY_ACK -> BORKEN [label="error/timeout",style=dashed]
WAIT_RSL_CHAN_MODE_MODIFY_ACK -> BORKEN [label="error/timeout",style=dashed]
+
+ BORKEN -> RECOVER_WAIT_ACTIV_ACK [label="X28"]
+ RECOVER_WAIT_ACTIV_ACK -> BORKEN [label="error/timeout",style=dashed]
+
+ RECOVER_WAIT_ACTIV_ACK -> RECOVER_WAIT_RF_RELEASE_ACK [label="rx ACK"]
+ RECOVER_WAIT_ACTIV_ACK -> RECOVER_WAIT_RF_RELEASE_ACK [label="rx NACK"]
+
+ RECOVER_WAIT_RF_RELEASE_ACK -> UNUSED [label="rx ACK"]
+ RECOVER_WAIT_RF_RELEASE_ACK -> BORKEN [label="error/timeout",style=dashed]
}
diff --git a/include/osmocom/bsc/lchan_fsm.h b/include/osmocom/bsc/lchan_fsm.h
index cf9f20f61..3c7bbc143 100644
--- a/include/osmocom/bsc/lchan_fsm.h
+++ b/include/osmocom/bsc/lchan_fsm.h
@@ -33,6 +33,8 @@ enum lchan_fsm_state {
LCHAN_ST_WAIT_RF_RELEASE_ACK,
LCHAN_ST_WAIT_AFTER_ERROR,
LCHAN_ST_BORKEN,
+ LCHAN_ST_RECOVER_WAIT_ACTIV_ACK, /*< Attempt to recover from BORKEN: first try to activate the lchan */
+ LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK, /*< Attempt to recover from BORKEN: then try to release it */
};
enum lchan_fsm_event {
diff --git a/src/osmo-bsc/lchan_fsm.c b/src/osmo-bsc/lchan_fsm.c
index d6dfe3ac7..a663326d4 100644
--- a/src/osmo-bsc/lchan_fsm.c
+++ b/src/osmo-bsc/lchan_fsm.c
@@ -334,6 +334,9 @@ struct osmo_tdef_state_timeout lchan_fsm_timeouts[32] = {
[LCHAN_ST_WAIT_AFTER_ERROR] = { .T = -3111 },
[LCHAN_ST_WAIT_RR_CHAN_MODE_MODIFY_ACK] = { .T = -13 },
[LCHAN_ST_WAIT_RSL_CHAN_MODE_MODIFY_ACK] = { .T = -14 },
+ [LCHAN_ST_BORKEN] = { .T = -28 },
+ [LCHAN_ST_RECOVER_WAIT_ACTIV_ACK] = { .T = -6 },
+ [LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK] = { .T = -6 },
};
/* Transition to a state, using the T timer defined in lchan_fsm_timeouts.
@@ -380,6 +383,8 @@ uint32_t lchan_fsm_on_error[32] = {
[LCHAN_ST_BORKEN] = LCHAN_ST_BORKEN,
[LCHAN_ST_WAIT_RR_CHAN_MODE_MODIFY_ACK] = LCHAN_ST_WAIT_RF_RELEASE_ACK,
[LCHAN_ST_WAIT_RSL_CHAN_MODE_MODIFY_ACK] = LCHAN_ST_WAIT_RF_RELEASE_ACK,
+ [LCHAN_ST_RECOVER_WAIT_ACTIV_ACK] = LCHAN_ST_BORKEN,
+ [LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK] = LCHAN_ST_BORKEN,
};
#define lchan_fail(fmt, args...) lchan_fail_to(lchan_fsm_on_error[fi->state], fmt, ## args)
@@ -1631,6 +1636,71 @@ static void lchan_fsm_borken(struct osmo_fsm_inst *fi, uint32_t event, void *dat
}
}
+static void lchan_fsm_recover_wait_activ_ack_onenter(struct osmo_fsm_inst *fi, uint32_t prev_state)
+{
+ int rc;
+ struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+ /* onenter handler of LCHAN_ST_RECOVER_WAIT_ACTIV_ACK, the state entered from
+  * LCHAN_ST_BORKEN when the BORKEN state timer fires (see lchan_fsm_timer_cb()).
+  * Sends an RSL Channel Activation for this lchan; the ACK/NACK is handled in
+  * lchan_fsm_recover_wait_activ_ack(). prev_state is not used. */
+ LOG_LCHAN(lchan, LOGL_INFO, "attempting to recover from BORKEN lchan\n");
+ /* NOTE(review): type and ta_known are overwritten with fixed values here, presumably
+  * only so that an activation message can be composed for the recovery attempt --
+  * confirm that SDCCH is acceptable for every timeslot configuration. */
+ lchan->type = GSM_LCHAN_SDCCH;
+ lchan->activate.info.ta_known = true;
+
+ chan_counts_ts_update(lchan->ts);
+ /* A non-zero return code is reported via lchan_fail(); for this state,
+  * lchan_fsm_on_error[] names LCHAN_ST_BORKEN as the state to fall back to. */
+ rc = rsl_tx_chan_activ(lchan, RSL_ACT_INTRA_NORM_ASS, 0);
+ if (rc)
+ lchan_fail("Tx Chan Activ failed: %s (%d)", strerror(-rc), rc);
+}
+/* Event handler of LCHAN_ST_RECOVER_WAIT_ACTIV_ACK.
+ * Both a Channel Activation ACK and a NACK lead on to LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK,
+ * so the release is attempted whatever the outcome of the activation. The state's
+ * in_event_mask lists only these two events; any other event trips the assertion in the
+ * default branch. The data argument is not used. */
+static void lchan_fsm_recover_wait_activ_ack(struct osmo_fsm_inst *fi, uint32_t event, void *data)
+{
+ struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+
+ switch (event) {
+ /* Activation ACKed: continue with releasing the lchan. */
+ case LCHAN_EV_RSL_CHAN_ACTIV_ACK:
+ lchan_fsm_state_chg(LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK);
+ break;
+
+ case LCHAN_EV_RSL_CHAN_ACTIV_NACK:
+ /* If an earlier lchan activ got through to the BTS, but the
+ * ACK did not get back to the BSC, it may still be active on
+ * the BTS side. Proceed to release it. */
+ LOG_LCHAN(lchan, LOGL_NOTICE, "received NACK for activation of BORKEN lchan, assuming still active\n");
+ lchan_fsm_state_chg(LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK);
+ break;
+
+ default:
+ OSMO_ASSERT(false);
+ }
+}
+/* onenter handler of LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK: send an RSL RF Channel Release
+ * for this lchan. A non-zero return code is reported via lchan_fail(); for this state,
+ * lchan_fsm_on_error[] names LCHAN_ST_BORKEN as the state to fall back to.
+ * prev_state is not used. */
+static void lchan_fsm_recover_wait_rf_release_ack_onenter(struct osmo_fsm_inst *fi, uint32_t prev_state)
+{
+ int rc;
+ struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+ /* NOTE(review): unlike the "Tx Chan Activ failed" message in the activation onenter
+  * handler, the format string below ends in "\n" -- check which form lchan_fail() expects. */
+ rc = rsl_tx_rf_chan_release(lchan);
+ if (rc)
+ lchan_fail("Tx RSL RF Channel Release failed: %s (%d)\n", strerror(-rc), rc);
+}
+/* Event handler of LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK.
+ * The only event in this state's in_event_mask is LCHAN_EV_RSL_RF_CHAN_REL_ACK; on it the
+ * recovery is logged as successful and the lchan goes to LCHAN_ST_UNUSED. Any other event
+ * trips the assertion in the default branch. The data argument is not used. */
+static void lchan_fsm_recover_wait_rf_release_ack(struct osmo_fsm_inst *fi, uint32_t event, void *data)
+{
+ struct gsm_lchan *lchan = lchan_fi_lchan(fi);
+ switch (event) {
+ /* Release ACKed: put the lchan back into LCHAN_ST_UNUSED. */
+ case LCHAN_EV_RSL_RF_CHAN_REL_ACK:
+ LOG_LCHAN(lchan, LOGL_NOTICE, "successfully recovered BORKEN lchan\n");
+ lchan_fsm_state_chg(LCHAN_ST_UNUSED);
+ break;
+
+ default:
+ OSMO_ASSERT(false);
+ }
+}
+
#define S(x) (1 << (x))
static const struct osmo_fsm_state lchan_fsm_states[] = {
@@ -1820,6 +1890,32 @@ static const struct osmo_fsm_state lchan_fsm_states[] = {
| S(LCHAN_ST_WAIT_RF_RELEASE_ACK)
| S(LCHAN_ST_UNUSED)
| S(LCHAN_ST_WAIT_AFTER_ERROR)
+ | S(LCHAN_ST_RECOVER_WAIT_ACTIV_ACK)
+ ,
+ },
+ [LCHAN_ST_RECOVER_WAIT_ACTIV_ACK] {
+ .name = "RECOVER_WAIT_ACTIV_ACK",
+ .onenter = lchan_fsm_recover_wait_activ_ack_onenter,
+ .action = lchan_fsm_recover_wait_activ_ack,
+ .in_event_mask = 0
+ | S(LCHAN_EV_RSL_CHAN_ACTIV_ACK)
+ | S(LCHAN_EV_RSL_CHAN_ACTIV_NACK)
+ ,
+ .out_state_mask = 0
+ | S(LCHAN_ST_BORKEN)
+ | S(LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK)
+ ,
+ },
+ [LCHAN_ST_RECOVER_WAIT_RF_RELEASE_ACK] {
+ .name = "RECOVER_WAIT_RF_RELEASE_ACK",
+ .onenter = lchan_fsm_recover_wait_rf_release_ack_onenter,
+ .action = lchan_fsm_recover_wait_rf_release_ack,
+ .in_event_mask = 0
+ | S(LCHAN_EV_RSL_RF_CHAN_REL_ACK)
+ ,
+ .out_state_mask = 0
+ | S(LCHAN_ST_BORKEN)
+ | S(LCHAN_ST_UNUSED)
,
},
};
@@ -1893,6 +1989,10 @@ static int lchan_fsm_timer_cb(struct osmo_fsm_inst *fi)
lchan_fsm_state_chg(LCHAN_ST_UNUSED);
return 0;
+ case LCHAN_ST_BORKEN:
+ lchan_fsm_state_chg(LCHAN_ST_RECOVER_WAIT_ACTIV_ACK);
+ return 0;
+
default:
lchan->release.in_error = true;
lchan->release.rsl_error_cause = RSL_ERR_INTERWORKING;
diff --git a/src/osmo-bsc/net_init.c b/src/osmo-bsc/net_init.c
index 258e91410..9142c73e1 100644
--- a/src/osmo-bsc/net_init.c
+++ b/src/osmo-bsc/net_init.c
@@ -74,6 +74,7 @@ static struct osmo_tdef gsm_network_T_defs[] = {
" after this amount of idle time, forget internally cumulated time remainders. Zero to always"
" keep remainders. See also X16, X17." },
{ .T = -25, .default_val = 5, .desc = "Timeout for initial user data after an MSC initiated an SCCP connection to the BSS" },
+ { .T = -28, .default_val = 30, .desc = "Interval at which to try to recover a BORKEN lchan" },
{ .T = -3105, .default_val = GSM_NY1_DEFAULT, .unit = OSMO_TDEF_CUSTOM,
.desc = "Ny1: Maximum number of Physical Information (re)transmissions" },
{ .T = -3111, .default_val = 4, .desc = "Wait time after lchan was released in error (should be T3111 + 2s)" },
diff --git a/tests/timer.vty b/tests/timer.vty
index e41805412..9b1843927 100644
--- a/tests/timer.vty
+++ b/tests/timer.vty
@@ -34,6 +34,7 @@ net: X16 = 1000 ms Granularity for all_allocated:* rate counters: amount of mill
net: X17 = 0 ms Rounding threshold for all_allocated:* rate counters: round up to the next counter increment after this many milliseconds. If set to half of X16 (or 0), employ the usual round() behavior: round up after half of a granularity period. If set to 1, behave like ceil(): already increment the counter immediately when all channels are allocated. If set >= X16, behave like floor(): only increment after a full X16 period of all channels being occupied. See also X16, X18 (default: 0 ms)
net: X18 = 60000 ms Forget-sum period for all_allocated:* rate counters: after this amount of idle time, forget internally cumulated time remainders. Zero to always keep remainders. See also X16, X17. (default: 60000 ms)
net: X25 = 5 s Timeout for initial user data after an MSC initiated an SCCP connection to the BSS (default: 5 s)
+net: X28 = 30 s Interval at which to try to recover a BORKEN lchan (default: 30 s)
net: X3105 = 17 Ny1: Maximum number of Physical Information (re)transmissions (default: 17)
net: X3111 = 4 s Wait time after lchan was released in error (should be T3111 + 2s) (default: 4 s)
net: X3113 = 60 s Maximum Paging Request Transmit Delay Threshold: If the estimated transmit delay of the messages in the paging queue surpasses this threshold, then new incoming paging requests will if possible replace a request in retransmission state from the queue or otherwise be discarded, hence limiting the size of the queue and maximum delay of its scheduled requests. X3113 also serves as the upper boundary for dynamic T3113 when estimating the expected maximum delay to get a response (default: 60 s)
@@ -90,6 +91,7 @@ net: X16 = 1000 ms Granularity for all_allocated:* rate counters: amount of mill
net: X17 = 0 ms Rounding threshold for all_allocated:* rate counters: round up to the next counter increment after this many milliseconds. If set to half of X16 (or 0), employ the usual round() behavior: round up after half of a granularity period. If set to 1, behave like ceil(): already increment the counter immediately when all channels are allocated. If set >= X16, behave like floor(): only increment after a full X16 period of all channels being occupied. See also X16, X18 (default: 0 ms)
net: X18 = 60000 ms Forget-sum period for all_allocated:* rate counters: after this amount of idle time, forget internally cumulated time remainders. Zero to always keep remainders. See also X16, X17. (default: 60000 ms)
net: X25 = 5 s Timeout for initial user data after an MSC initiated an SCCP connection to the BSS (default: 5 s)
+net: X28 = 30 s Interval at which to try to recover a BORKEN lchan (default: 30 s)
net: X3105 = 17 Ny1: Maximum number of Physical Information (re)transmissions (default: 17)
net: X3111 = 4 s Wait time after lchan was released in error (should be T3111 + 2s) (default: 4 s)
net: X3113 = 60 s Maximum Paging Request Transmit Delay Threshold: If the estimated transmit delay of the messages in the paging queue surpasses this threshold, then new incoming paging requests will if possible replace a request in retransmission state from the queue or otherwise be discarded, hence limiting the size of the queue and maximum delay of its scheduled requests. X3113 also serves as the upper boundary for dynamic T3113 when estimating the expected maximum delay to get a response (default: 60 s)