qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

filter-rewriter.c (13730B)


      1 /*
      2  * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
      3  * Copyright (c) 2016 FUJITSU LIMITED
      4  * Copyright (c) 2016 Intel Corporation
      5  *
      6  * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
      7  *
      8  * This work is licensed under the terms of the GNU GPL, version 2 or
      9  * later.  See the COPYING file in the top-level directory.
     10  */
     11 
     12 #include "qemu/osdep.h"
     13 #include "trace.h"
     14 #include "colo.h"
     15 #include "net/filter.h"
     16 #include "net/net.h"
     17 #include "qemu/error-report.h"
     18 #include "qom/object.h"
     19 #include "qemu/main-loop.h"
     20 #include "qemu/iov.h"
     21 #include "net/checksum.h"
     22 #include "net/colo.h"
     23 #include "migration/colo.h"
     24 #include "util.h"
     25 
/* QOM type name under which the filter is registered */
#define TYPE_FILTER_REWRITER "filter-rewriter"
OBJECT_DECLARE_SIMPLE_TYPE(RewriterState, FILTER_REWRITER)

/* Values stored in RewriterState.failover_mode */
#define FAILOVER_MODE_ON  true
#define FAILOVER_MODE_OFF false

/* Per-instance state of a filter-rewriter object */
struct RewriterState {
    NetFilterState parent_obj;
    /* queue through which rewritten packets are sent on */
    NetQueue *incoming_queue;
    /* hashtable to save connection */
    GHashTable *connection_track_table;
    /*
     * backs the "vnet_hdr_support" property; when set, the receive path
     * takes the vnet header length from the attached netdev
     */
    bool vnet_hdr;
    /*
     * once set, TCP packets whose connection is not already tracked
     * are passed through without being rewritten
     */
    bool failover_mode;
};
     40 
/* Put the rewriter into failover mode by setting s->failover_mode. */
static void filter_rewriter_failover_mode(RewriterState *s)
{
    s->failover_mode = FAILOVER_MODE_ON;
}
     45 
     46 static void filter_rewriter_flush(NetFilterState *nf)
     47 {
     48     RewriterState *s = FILTER_REWRITER(nf);
     49 
     50     if (!qemu_net_queue_flush(s->incoming_queue)) {
     51         /* Unable to empty the queue, purge remaining packets */
     52         qemu_net_queue_purge(s->incoming_queue, nf->netdev);
     53     }
     54 }
     55 
     56 /*
     57  * Return 1 on success, if return 0 means the pkt
     58  * is not TCP packet
     59  */
     60 static int is_tcp_packet(Packet *pkt)
     61 {
     62     if (!parse_packet_early(pkt) &&
     63         pkt->ip->ip_p == IPPROTO_TCP) {
     64         return 1;
     65     } else {
     66         return 0;
     67     }
     68 }
     69 
     70 /* handle tcp packet from primary guest */
     71 static int handle_primary_tcp_pkt(RewriterState *rf,
     72                                   Connection *conn,
     73                                   Packet *pkt, ConnectionKey *key)
     74 {
     75     struct tcp_hdr *tcp_pkt;
     76 
     77     tcp_pkt = (struct tcp_hdr *)pkt->transport_header;
     78     if (trace_event_get_state_backends(TRACE_COLO_FILTER_REWRITER_PKT_INFO)) {
     79         trace_colo_filter_rewriter_pkt_info(__func__,
     80                     inet_ntoa(pkt->ip->ip_src), inet_ntoa(pkt->ip->ip_dst),
     81                     ntohl(tcp_pkt->th_seq), ntohl(tcp_pkt->th_ack),
     82                     tcp_pkt->th_flags);
     83     }
     84     if (trace_event_get_state_backends(
     85           TRACE_COLO_FILTER_REWRITER_CONN_OFFSET)) {
     86         trace_colo_filter_rewriter_conn_offset(conn->offset);
     87     }
     88 
     89     if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN)) &&
     90         conn->tcp_state == TCPS_SYN_SENT) {
     91         conn->tcp_state = TCPS_ESTABLISHED;
     92     }
     93 
     94     if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
     95         /*
     96          * we use this flag update offset func
     97          * run once in independent tcp connection
     98          */
     99         conn->tcp_state = TCPS_SYN_RECEIVED;
    100     }
    101 
    102     if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK)) {
    103         if (conn->tcp_state == TCPS_SYN_RECEIVED) {
    104             /*
    105              * offset = secondary_seq - primary seq
    106              * ack packet sent by guest from primary node,
    107              * so we use th_ack - 1 get primary_seq
    108              */
    109             conn->offset -= (ntohl(tcp_pkt->th_ack) - 1);
    110             conn->tcp_state = TCPS_ESTABLISHED;
    111         }
    112         if (conn->offset) {
    113             /* handle packets to the secondary from the primary */
    114             tcp_pkt->th_ack = htonl(ntohl(tcp_pkt->th_ack) + conn->offset);
    115 
    116             net_checksum_calculate((uint8_t *)pkt->data + pkt->vnet_hdr_len,
    117                                    pkt->size - pkt->vnet_hdr_len, CSUM_TCP);
    118         }
    119 
    120         /*
    121          * Passive close step 3
    122          */
    123         if ((conn->tcp_state == TCPS_LAST_ACK) &&
    124             (ntohl(tcp_pkt->th_ack) == (conn->fin_ack_seq + 1))) {
    125             conn->tcp_state = TCPS_CLOSED;
    126             g_hash_table_remove(rf->connection_track_table, key);
    127         }
    128     }
    129 
    130     if ((tcp_pkt->th_flags & TH_FIN) == TH_FIN) {
    131         /*
    132          * Passive close.
    133          * Step 1:
    134          * The *server* side of this connect is VM, *client* tries to close
    135          * the connection. We will into CLOSE_WAIT status.
    136          *
    137          * Step 2:
    138          * In this step we will into LAST_ACK status.
    139          *
    140          * We got 'fin=1, ack=1' packet from server side, we need to
    141          * record the seq of 'fin=1, ack=1' packet.
    142          *
    143          * Step 3:
    144          * We got 'ack=1' packets from client side, it acks 'fin=1, ack=1'
    145          * packet from server side. From this point, we can ensure that there
    146          * will be no packets in the connection, except that, some errors
    147          * happen between the path of 'filter object' and vNIC, if this rare
    148          * case really happen, we can still create a new connection,
    149          * So it is safe to remove the connection from connection_track_table.
    150          *
    151          */
    152         if (conn->tcp_state == TCPS_ESTABLISHED) {
    153             conn->tcp_state = TCPS_CLOSE_WAIT;
    154         }
    155 
    156         /*
    157          * Active close step 2.
    158          */
    159         if (conn->tcp_state == TCPS_FIN_WAIT_1) {
    160             /*
    161              * For simplify implementation, we needn't wait 2MSL time
    162              * in filter rewriter. Because guest kernel will track the
    163              * TCP status and wait 2MSL time, if client resend the FIN
    164              * packet, guest will apply the last ACK too.
    165              * So, we skip the TCPS_TIME_WAIT state here and go straight
    166              * to TCPS_CLOSED state.
    167              */
    168             conn->tcp_state = TCPS_CLOSED;
    169             g_hash_table_remove(rf->connection_track_table, key);
    170         }
    171     }
    172 
    173     return 0;
    174 }
    175 
    176 /* handle tcp packet from secondary guest */
    177 static int handle_secondary_tcp_pkt(RewriterState *rf,
    178                                     Connection *conn,
    179                                     Packet *pkt, ConnectionKey *key)
    180 {
    181     struct tcp_hdr *tcp_pkt;
    182 
    183     tcp_pkt = (struct tcp_hdr *)pkt->transport_header;
    184 
    185     if (trace_event_get_state_backends(TRACE_COLO_FILTER_REWRITER_PKT_INFO)) {
    186         trace_colo_filter_rewriter_pkt_info(__func__,
    187                     inet_ntoa(pkt->ip->ip_src), inet_ntoa(pkt->ip->ip_dst),
    188                     ntohl(tcp_pkt->th_seq), ntohl(tcp_pkt->th_ack),
    189                     tcp_pkt->th_flags);
    190     }
    191     if (trace_event_get_state_backends(
    192           TRACE_COLO_FILTER_REWRITER_CONN_OFFSET)) {
    193         trace_colo_filter_rewriter_conn_offset(conn->offset);
    194     }
    195 
    196     if (conn->tcp_state == TCPS_SYN_RECEIVED &&
    197         ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN))) {
    198         /*
    199          * save offset = secondary_seq and then
    200          * in handle_primary_tcp_pkt make offset
    201          * = secondary_seq - primary_seq
    202          */
    203         conn->offset = ntohl(tcp_pkt->th_seq);
    204     }
    205 
    206     /* VM active connect */
    207     if (conn->tcp_state == TCPS_CLOSED &&
    208         ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
    209         conn->tcp_state = TCPS_SYN_SENT;
    210     }
    211 
    212     if ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK) {
    213         /* Only need to adjust seq while offset is Non-zero */
    214         if (conn->offset) {
    215             /* handle packets to the primary from the secondary*/
    216             tcp_pkt->th_seq = htonl(ntohl(tcp_pkt->th_seq) - conn->offset);
    217 
    218             net_checksum_calculate((uint8_t *)pkt->data + pkt->vnet_hdr_len,
    219                                    pkt->size - pkt->vnet_hdr_len, CSUM_TCP);
    220         }
    221     }
    222 
    223     /*
    224      * Passive close step 2:
    225      */
    226     if (conn->tcp_state == TCPS_CLOSE_WAIT &&
    227         (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == (TH_ACK | TH_FIN)) {
    228         conn->fin_ack_seq = ntohl(tcp_pkt->th_seq);
    229         conn->tcp_state = TCPS_LAST_ACK;
    230     }
    231 
    232     /*
    233      * Active close
    234      *
    235      * Step 1:
    236      * The *server* side of this connect is VM, *server* tries to close
    237      * the connection.
    238      *
    239      * Step 2:
    240      * We will into CLOSE_WAIT status.
    241      * We simplify the TCPS_FIN_WAIT_2, TCPS_TIME_WAIT and
    242      * CLOSING status.
    243      */
    244     if (conn->tcp_state == TCPS_ESTABLISHED &&
    245         (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == TH_FIN) {
    246         conn->tcp_state = TCPS_FIN_WAIT_1;
    247     }
    248 
    249     return 0;
    250 }
    251 
    252 static ssize_t colo_rewriter_receive_iov(NetFilterState *nf,
    253                                          NetClientState *sender,
    254                                          unsigned flags,
    255                                          const struct iovec *iov,
    256                                          int iovcnt,
    257                                          NetPacketSent *sent_cb)
    258 {
    259     RewriterState *s = FILTER_REWRITER(nf);
    260     Connection *conn;
    261     ConnectionKey key;
    262     Packet *pkt;
    263     ssize_t size = iov_size(iov, iovcnt);
    264     ssize_t vnet_hdr_len = 0;
    265     char *buf = g_malloc0(size);
    266 
    267     iov_to_buf(iov, iovcnt, 0, buf, size);
    268 
    269     if (s->vnet_hdr) {
    270         vnet_hdr_len = nf->netdev->vnet_hdr_len;
    271     }
    272 
    273     pkt = packet_new_nocopy(buf, size, vnet_hdr_len);
    274 
    275     /*
    276      * if we get tcp packet
    277      * we will rewrite it to make secondary guest's
    278      * connection established successfully
    279      */
    280     if (pkt && is_tcp_packet(pkt)) {
    281 
    282         fill_connection_key(pkt, &key, sender == nf->netdev);
    283 
    284         /* After failover we needn't change new TCP packet */
    285         if (s->failover_mode &&
    286             !connection_has_tracked(s->connection_track_table, &key)) {
    287             goto out;
    288         }
    289 
    290         conn = connection_get(s->connection_track_table,
    291                               &key,
    292                               NULL);
    293 
    294         if (sender == nf->netdev) {
    295             /* NET_FILTER_DIRECTION_TX */
    296             if (!handle_primary_tcp_pkt(s, conn, pkt, &key)) {
    297                 qemu_net_queue_send(s->incoming_queue, sender, 0,
    298                 (const uint8_t *)pkt->data, pkt->size, NULL);
    299                 packet_destroy(pkt, NULL);
    300                 pkt = NULL;
    301                 /*
    302                  * We block the packet here,after rewrite pkt
    303                  * and will send it
    304                  */
    305                 return 1;
    306             }
    307         } else {
    308             /* NET_FILTER_DIRECTION_RX */
    309             if (!handle_secondary_tcp_pkt(s, conn, pkt, &key)) {
    310                 qemu_net_queue_send(s->incoming_queue, sender, 0,
    311                 (const uint8_t *)pkt->data, pkt->size, NULL);
    312                 packet_destroy(pkt, NULL);
    313                 pkt = NULL;
    314                 /*
    315                  * We block the packet here,after rewrite pkt
    316                  * and will send it
    317                  */
    318                 return 1;
    319             }
    320         }
    321     }
    322 
    323 out:
    324     packet_destroy(pkt, NULL);
    325     pkt = NULL;
    326     return 0;
    327 }
    328 
    329 static void reset_seq_offset(gpointer key, gpointer value, gpointer user_data)
    330 {
    331     Connection *conn = (Connection *)value;
    332 
    333     conn->offset = 0;
    334 }
    335 
    336 static gboolean offset_is_nonzero(gpointer key,
    337                                   gpointer value,
    338                                   gpointer user_data)
    339 {
    340     Connection *conn = (Connection *)value;
    341 
    342     return conn->offset ? true : false;
    343 }
    344 
    345 static void colo_rewriter_handle_event(NetFilterState *nf, int event,
    346                                        Error **errp)
    347 {
    348     RewriterState *rs = FILTER_REWRITER(nf);
    349 
    350     switch (event) {
    351     case COLO_EVENT_CHECKPOINT:
    352         g_hash_table_foreach(rs->connection_track_table,
    353                             reset_seq_offset, NULL);
    354         break;
    355     case COLO_EVENT_FAILOVER:
    356         if (!g_hash_table_find(rs->connection_track_table,
    357                               offset_is_nonzero, NULL)) {
    358             filter_rewriter_failover_mode(rs);
    359         }
    360         break;
    361     default:
    362         break;
    363     }
    364 }
    365 
    366 static void colo_rewriter_cleanup(NetFilterState *nf)
    367 {
    368     RewriterState *s = FILTER_REWRITER(nf);
    369 
    370     /* flush packets */
    371     if (s->incoming_queue) {
    372         filter_rewriter_flush(nf);
    373         g_free(s->incoming_queue);
    374     }
    375 
    376     g_hash_table_destroy(s->connection_track_table);
    377 }
    378 
    379 static void colo_rewriter_setup(NetFilterState *nf, Error **errp)
    380 {
    381     RewriterState *s = FILTER_REWRITER(nf);
    382 
    383     s->connection_track_table = g_hash_table_new_full(connection_key_hash,
    384                                                       connection_key_equal,
    385                                                       g_free,
    386                                                       NULL);
    387     s->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf);
    388 }
    389 
    390 static bool filter_rewriter_get_vnet_hdr(Object *obj, Error **errp)
    391 {
    392     RewriterState *s = FILTER_REWRITER(obj);
    393 
    394     return s->vnet_hdr;
    395 }
    396 
    397 static void filter_rewriter_set_vnet_hdr(Object *obj,
    398                                          bool value,
    399                                          Error **errp)
    400 {
    401     RewriterState *s = FILTER_REWRITER(obj);
    402 
    403     s->vnet_hdr = value;
    404 }
    405 
    406 static void filter_rewriter_init(Object *obj)
    407 {
    408     RewriterState *s = FILTER_REWRITER(obj);
    409 
    410     s->vnet_hdr = false;
    411     s->failover_mode = FAILOVER_MODE_OFF;
    412 }
    413 
    414 static void colo_rewriter_class_init(ObjectClass *oc, void *data)
    415 {
    416     NetFilterClass *nfc = NETFILTER_CLASS(oc);
    417 
    418     object_class_property_add_bool(oc, "vnet_hdr_support",
    419                                    filter_rewriter_get_vnet_hdr,
    420                                    filter_rewriter_set_vnet_hdr);
    421 
    422     nfc->setup = colo_rewriter_setup;
    423     nfc->cleanup = colo_rewriter_cleanup;
    424     nfc->receive_iov = colo_rewriter_receive_iov;
    425     nfc->handle_event = colo_rewriter_handle_event;
    426 }
    427 
    428 static const TypeInfo colo_rewriter_info = {
    429     .name = TYPE_FILTER_REWRITER,
    430     .parent = TYPE_NETFILTER,
    431     .class_init = colo_rewriter_class_init,
    432     .instance_init = filter_rewriter_init,
    433     .instance_size = sizeof(RewriterState),
    434 };
    435 
/* Register the filter-rewriter type described by colo_rewriter_info. */
static void register_types(void)
{
    type_register_static(&colo_rewriter_info);
}

/* Hook register_types() into QEMU's type initialisation. */
type_init(register_types);