diff --git a/projects/wireguard/README.md b/projects/wireguard/README.md
index 33bdc8890..07122bc03 100644
--- a/projects/wireguard/README.md
+++ b/projects/wireguard/README.md
@@ -1,16 +1,16 @@
 # WireGuard
 
-[WireGuard](https://wireguard.io) is a modern VPN released for the Linux kernel that can replace IPSec.
+[WireGuard](https://www.wireguard.com) is a modern VPN released for the Linux kernel that can replace IPSec.
 We can use WireGuard in Moby to better secure container networking.
 
 WireGuard transparently encrypts *and* authenticates traffic between all peers, and uses state-of-the-art cryptography
-from the [Noise protocol](http://www.noiseprotocol.org/). Moreover, WireGuard is implemented in less than a few thousand
+from the [Noise protocol](https://noiseprotocol.org/). WireGuard is implemented in only a few thousand
 lines of code, making it auditable for security.
 
 Moreover, WireGuard provides a `wg0` (`wg1`, `wg2`,... etc) network interface that can be passed directly to
 containers, such that all intercontainer traffic would benefit from encrypted and authenticated networking.
 
-A full technical paper from NDSS 2017 is available [here](https://www.wireguard.io/papers/wireguard.pdf).
+A full technical paper from NDSS 2017 is available [here](https://www.wireguard.com/papers/wireguard.pdf).
 The protocol has been formally verified, with a paper describing the security proofs available [here](https://www.wireguard.com/papers/wireguard-formal-verification.pdf).
 
 ## Contents
@@ -27,8 +27,8 @@ This is built into the `mobylinux/init-wireguard` image that is generated by `cd
 
 ## Quickstart
 
 The quickest way to get started is to use the provided `examples/wireguard.yml` in this directory and use the prebuilt images.
-To give WireGuard a spin, the [official quick start](https://www.wireguard.io/quickstart/) is a good way to get going. For containers,
-WireGuard has a [network namespace integration](https://www.wireguard.io/netns/) that we could use for Moby's containers.
+To give WireGuard a spin, the [official quick start](https://www.wireguard.com/quickstart/) is a good way to get going. For containers,
+WireGuard has a [network namespace integration](https://www.wireguard.com/netns/) that we could use for Moby's containers.
 
 ## Roadmap
diff --git a/projects/wireguard/kernel/patches-4.9.x/WireGuard.patch b/projects/wireguard/kernel/patches-4.9.x/WireGuard.patch
index 5dad530bd..77cb291ab 100644
--- a/projects/wireguard/kernel/patches-4.9.x/WireGuard.patch
+++ b/projects/wireguard/kernel/patches-4.9.x/WireGuard.patch
@@ -1,6 +1,6 @@
---- /dev/null
-+++ b/net/wireguard/config.c
-@@ -0,0 +1,321 @@
+--- /dev/null 2017-07-05 16:27:37.615351856 +0200
++++ b/net/wireguard/config.c 2017-07-06 18:17:33.000000000 +0200
+@@ -0,0 +1,353 @@
 +/* Copyright (C) 2015-2017 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 
*/ + +#include "config.h" @@ -12,19 +12,15 @@ +#include "peer.h" +#include "uapi.h" + -+static int clear_peer_endpoint_src(struct wireguard_peer *peer, void *data) -+{ -+ socket_clear_peer_endpoint_src(peer); -+ return 0; -+} -+ +static int set_device_port(struct wireguard_device *wg, u16 port) +{ ++ struct wireguard_peer *peer, *temp; + socket_uninit(wg); + wg->incoming_port = port; + if (!(netdev_pub(wg)->flags & IFF_UP)) + return 0; -+ peer_for_each_unlocked(wg, clear_peer_endpoint_src, NULL); ++ peer_for_each (wg, peer, temp, false) ++ socket_clear_peer_endpoint_src(peer); + return socket_init(wg); +} + @@ -63,8 +59,20 @@ + peer = pubkey_hashtable_lookup(&wg->peer_hashtable, in_peer.public_key); + if (!peer) { /* Peer doesn't exist yet. Add a new one. */ + if (in_peer.flags & WGPEER_REMOVE_ME) -+ return -ENODEV; /* Tried to remove a non existing peer. */ -+ peer = peer_rcu_get(peer_create(wg, in_peer.public_key)); ++ return -ENODEV; /* Tried to remove a non-existing peer. */ ++ if (in_peer.flags & WGPEER_REMOVE_PRESHARED_KEY) ++ return -EINVAL; /* Tried to remove a psk for a non-existing peer. */ ++ ++ down_read(&wg->static_identity.lock); ++ if (wg->static_identity.has_identity && !memcmp(in_peer.public_key, wg->static_identity.static_public, NOISE_PUBLIC_KEY_LEN)) { ++ /* We silently ignore peers that have the same public key as the device. The reason we do it silently ++ * is that we'd like for people to be able to reuse the same set of API calls across peers. */ ++ up_read(&wg->static_identity.lock); ++ goto out; ++ } ++ up_read(&wg->static_identity.lock); ++ ++ peer = peer_rcu_get(peer_create(wg, in_peer.public_key, in_peer.preshared_key)); + if (!peer) + return -ENOMEM; + if (netdev_pub(wg)->flags & IFF_UP) @@ -77,8 +85,18 @@ + goto out; + } + ++ if (in_peer.flags & WGPEER_REMOVE_PRESHARED_KEY) { ++ down_write(&peer->handshake.lock); ++ memset(&peer->handshake.preshared_key, 0, NOISE_SYMMETRIC_KEY_LEN); ++ up_write(&peer->handshake.lock); ++ } else if (memcmp(zeros, in_peer.preshared_key, WG_KEY_LEN)) { ++ down_write(&peer->handshake.lock); ++ memcpy(&peer->handshake.preshared_key, in_peer.preshared_key, NOISE_SYMMETRIC_KEY_LEN); ++ up_write(&peer->handshake.lock); ++ } ++ + if (in_peer.endpoint.addr.sa_family == AF_INET || in_peer.endpoint.addr.sa_family == AF_INET6) { -+ struct endpoint endpoint = { 0 }; ++ struct endpoint endpoint = { { { 0 } } }; + memcpy(&endpoint, &in_peer.endpoint, sizeof(in_peer.endpoint)); + socket_set_peer_endpoint(peer, &endpoint); + } @@ -112,8 +130,9 @@ + +int config_set_device(struct wireguard_device *wg, void __user *user_device) +{ -+ int ret = 0; ++ int ret; + size_t i, offset; ++ struct wireguard_peer *peer, *temp; + struct wgdevice in_device; + void __user *user_peer; + bool modified_static_identity = false; @@ -123,14 +142,18 @@ + + mutex_lock(&wg->device_update_lock); + -+ if (copy_from_user(&in_device, user_device, sizeof(in_device))) { -+ ret = -EFAULT; ++ ret = -EFAULT; ++ if (copy_from_user(&in_device, user_device, sizeof(in_device))) ++ goto out; ++ ++ ret = -EPROTO; ++ if (in_device.version_magic != WG_API_VERSION_MAGIC) + goto out; -+ } + + if (in_device.fwmark || (!in_device.fwmark && (in_device.flags & WGDEVICE_REMOVE_FWMARK))) { + wg->fwmark = in_device.fwmark; -+ peer_for_each_unlocked(wg, clear_peer_endpoint_src, NULL); ++ peer_for_each (wg, peer, temp, false) ++ socket_clear_peer_endpoint_src(peer); + } + + if (in_device.port) { @@ -146,27 +169,34 @@ + noise_set_static_identity_private_key(&wg->static_identity, NULL); + 
modified_static_identity = true; + } else if (memcmp(zeros, in_device.private_key, WG_KEY_LEN)) { ++ u8 public_key[NOISE_PUBLIC_KEY_LEN] = { 0 }; ++ struct wireguard_peer *peer; ++ /* We remove before setting, to prevent race, which means doing two 25519-genpub ops. */ ++ bool unused __attribute((unused)) = curve25519_generate_public(public_key, in_device.private_key); ++ peer = pubkey_hashtable_lookup(&wg->peer_hashtable, public_key); ++ if (peer) { ++ peer_put(peer); ++ peer_remove(peer); ++ } ++ + noise_set_static_identity_private_key(&wg->static_identity, in_device.private_key); + modified_static_identity = true; + } + -+ if (in_device.flags & WGDEVICE_REMOVE_PRESHARED_KEY) { -+ noise_set_static_identity_preshared_key(&wg->static_identity, NULL); -+ modified_static_identity = true; -+ } else if (memcmp(zeros, in_device.preshared_key, WG_KEY_LEN)) { -+ noise_set_static_identity_preshared_key(&wg->static_identity, in_device.preshared_key); -+ modified_static_identity = true; ++ if (modified_static_identity) { ++ peer_for_each (wg, peer, temp, false) { ++ if (!noise_precompute_static_static(peer)) ++ peer_remove(peer); ++ } ++ cookie_checker_precompute_device_keys(&wg->cookie_checker); + } + -+ if (modified_static_identity) -+ cookie_checker_precompute_keys(&wg->cookie_checker, NULL); -+ + for (i = 0, offset = 0, user_peer = user_device + sizeof(struct wgdevice); i < in_device.num_peers; ++i, user_peer += offset) { + ret = set_peer(wg, user_peer, &offset); + if (ret) -+ break; ++ goto out; + } -+ ++ ret = 0; +out: + mutex_unlock(&wg->device_update_lock); + memzero_explicit(&in_device.private_key, NOISE_PUBLIC_KEY_LEN); @@ -189,20 +219,6 @@ + return 0; +} + -+static int calculate_ipmasks_size(void *ctx, struct wireguard_peer *peer, union nf_inet_addr ip, u8 cidr, int family) -+{ -+ size_t *count = ctx; -+ *count += sizeof(struct wgipmask); -+ return 0; -+} -+ -+static size_t calculate_peers_size(struct wireguard_device *wg) -+{ -+ size_t len = peer_total_count(wg) * sizeof(struct wgpeer); -+ routing_table_walk_ips(&wg->peer_routing_table, &len, calculate_ipmasks_size); -+ return len; -+} -+ +static int populate_ipmask(void *ctx, union nf_inet_addr ip, u8 cidr, int family) +{ + int ret; @@ -229,10 +245,9 @@ + return ret; +} + -+static int populate_peer(struct wireguard_peer *peer, void *ctx) ++static int populate_peer(struct wireguard_peer *peer, struct data_remaining *data) +{ + int ret = 0; -+ struct data_remaining *data = ctx; + void __user *upeer = data->data; + struct wgpeer out_peer; + struct data_remaining ipmasks_data = { NULL }; @@ -243,7 +258,11 @@ + if (ret) + return ret; + ++ down_read(&peer->handshake.lock); + memcpy(out_peer.public_key, peer->handshake.remote_static, NOISE_PUBLIC_KEY_LEN); ++ memcpy(out_peer.preshared_key, peer->handshake.preshared_key, NOISE_SYMMETRIC_KEY_LEN); ++ up_read(&peer->handshake.lock); ++ + read_lock_bh(&peer->endpoint_lock); + if (peer->endpoint.addr.sa_family == AF_INET) + out_peer.endpoint.addr4 = peer->endpoint.addr4; @@ -269,9 +288,10 @@ + return ret; +} + -+int config_get_device(struct wireguard_device *wg, void __user *udevice) ++int config_get_device(struct wireguard_device *wg, void __user *user_device) +{ -+ int ret = 0; ++ int ret; ++ struct wireguard_peer *peer, *temp; + struct net_device *dev = netdev_pub(wg); + struct data_remaining peer_data = { NULL }; + struct wgdevice out_device; @@ -284,53 +304,66 @@ + + mutex_lock(&wg->device_update_lock); + -+ if (!udevice) { -+ ret = calculate_peers_size(wg); ++ if (!user_device) { ++ ret = 
peer_total_count(wg) * sizeof(struct wgpeer) ++ + routing_table_count_nodes(&wg->peer_routing_table) * sizeof(struct wgipmask); + goto out; + } + -+ if (copy_from_user(&in_device, udevice, sizeof(in_device))) { -+ ret = -EFAULT; ++ ret = -EFAULT; ++ if (copy_from_user(&in_device, user_device, sizeof(in_device))) + goto out; -+ } + ++ ret = -EPROTO; ++ if (in_device.version_magic != WG_API_VERSION_MAGIC) ++ goto out; ++ ++ out_device.version_magic = WG_API_VERSION_MAGIC; + out_device.port = wg->incoming_port; + out_device.fwmark = wg->fwmark; -+ strncpy(out_device.interface, dev->name, IFNAMSIZ - 1); -+ out_device.interface[IFNAMSIZ - 1] = 0; ++ memcpy(out_device.interface, dev->name, IFNAMSIZ); + + down_read(&wg->static_identity.lock); + if (wg->static_identity.has_identity) { + memcpy(out_device.private_key, wg->static_identity.static_private, WG_KEY_LEN); + memcpy(out_device.public_key, wg->static_identity.static_public, WG_KEY_LEN); -+ memcpy(out_device.preshared_key, wg->static_identity.preshared_key, WG_KEY_LEN); + } + up_read(&wg->static_identity.lock); + + peer_data.out_len = in_device.peers_size; -+ peer_data.data = udevice + sizeof(struct wgdevice); -+ ret = peer_for_each_unlocked(wg, populate_peer, &peer_data); ++ peer_data.data = user_device + sizeof(struct wgdevice); ++ ++ ret = 0; ++ peer_for_each (wg, peer, temp, false) { ++ ret = populate_peer(peer, &peer_data); ++ if (ret) ++ break; ++ } + if (ret) + goto out; + out_device.num_peers = peer_data.count; + -+ if (copy_to_user(udevice, &out_device, sizeof(out_device))) -+ ret = -EFAULT; ++ ret = -EFAULT; ++ if (copy_to_user(user_device, &out_device, sizeof(out_device))) ++ goto out; ++ ++ ret = 0; + +out: + mutex_unlock(&wg->device_update_lock); + memzero_explicit(&out_device.private_key, NOISE_PUBLIC_KEY_LEN); + return ret; +} ---- /dev/null -+++ b/net/wireguard/cookie.c -@@ -0,0 +1,215 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/cookie.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,192 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ + +#include "cookie.h" +#include "peer.h" +#include "device.h" +#include "messages.h" ++#include "ratelimiter.h" +#include "crypto/blake2s.h" +#include "crypto/chacha20poly1305.h" + @@ -338,46 +371,45 @@ +#include +#include + -+int cookie_checker_init(struct cookie_checker *checker, struct wireguard_device *wg) ++void cookie_checker_init(struct cookie_checker *checker, struct wireguard_device *wg) +{ -+ int ret = ratelimiter_init(&checker->ratelimiter, wg); -+ if (ret) -+ return ret; + init_rwsem(&checker->secret_lock); + checker->secret_birthdate = get_jiffies_64(); + get_random_bytes(checker->secret, NOISE_HASH_LEN); + checker->device = wg; -+ return 0; +} + -+static int precompute_peer_key(struct wireguard_peer *peer, void *psk) ++enum { COOKIE_KEY_LABEL_LEN = 8 }; ++static const u8 mac1_key_label[COOKIE_KEY_LABEL_LEN] = "mac1----"; ++static const u8 cookie_key_label[COOKIE_KEY_LABEL_LEN] = "cookie--"; ++ ++static void precompute_key(u8 key[NOISE_SYMMETRIC_KEY_LEN], const u8 pubkey[NOISE_PUBLIC_KEY_LEN], const u8 label[COOKIE_KEY_LABEL_LEN]) +{ -+ blake2s(peer->latest_cookie.cookie_decryption_key, peer->handshake.remote_static, psk, NOISE_SYMMETRIC_KEY_LEN, NOISE_PUBLIC_KEY_LEN, psk ? 
NOISE_SYMMETRIC_KEY_LEN : 0); -+ return 0; ++ struct blake2s_state blake; ++ blake2s_init(&blake, NOISE_SYMMETRIC_KEY_LEN); ++ blake2s_update(&blake, label, COOKIE_KEY_LABEL_LEN); ++ blake2s_update(&blake, pubkey, NOISE_PUBLIC_KEY_LEN); ++ blake2s_final(&blake, key, NOISE_SYMMETRIC_KEY_LEN); +} + -+void cookie_checker_precompute_keys(struct cookie_checker *checker, struct wireguard_peer *peer) ++void cookie_checker_precompute_device_keys(struct cookie_checker *checker) +{ + down_read(&checker->device->static_identity.lock); -+ if (unlikely(checker->device->static_identity.has_identity)) { -+ memset(checker->cookie_encryption_key, 0, NOISE_SYMMETRIC_KEY_LEN); -+ goto out; ++ if (likely(checker->device->static_identity.has_identity)) { ++ precompute_key(checker->cookie_encryption_key, checker->device->static_identity.static_public, cookie_key_label); ++ precompute_key(checker->message_mac1_key, checker->device->static_identity.static_public, mac1_key_label); + } -+ -+ if (peer) -+ precompute_peer_key(peer, checker->device->static_identity.has_psk ? checker->device->static_identity.preshared_key : NULL); + else { -+ blake2s(checker->cookie_encryption_key, checker->device->static_identity.static_public, checker->device->static_identity.preshared_key, NOISE_SYMMETRIC_KEY_LEN, NOISE_PUBLIC_KEY_LEN, checker->device->static_identity.has_psk ? NOISE_SYMMETRIC_KEY_LEN : 0); -+ peer_for_each_unlocked(checker->device, precompute_peer_key, checker->device->static_identity.has_psk ? checker->device->static_identity.preshared_key : NULL); ++ memset(checker->cookie_encryption_key, 0, NOISE_SYMMETRIC_KEY_LEN); ++ memset(checker->message_mac1_key, 0, NOISE_SYMMETRIC_KEY_LEN); + } -+ -+out: + up_read(&checker->device->static_identity.lock); +} + -+void cookie_checker_uninit(struct cookie_checker *checker) ++void cookie_checker_precompute_peer_keys(struct wireguard_peer *peer) +{ -+ ratelimiter_uninit(&checker->ratelimiter); ++ precompute_key(peer->latest_cookie.cookie_decryption_key, peer->handshake.remote_static, cookie_key_label); ++ precompute_key(peer->latest_cookie.message_mac1_key, peer->handshake.remote_static, mac1_key_label); +} + +void cookie_init(struct cookie *cookie) @@ -386,18 +418,10 @@ + init_rwsem(&cookie->lock); +} + -+static void compute_mac1(u8 mac1[COOKIE_LEN], const void *message, size_t len, const u8 pubkey[NOISE_PUBLIC_KEY_LEN], const u8 psk[NOISE_SYMMETRIC_KEY_LEN]) ++static void compute_mac1(u8 mac1[COOKIE_LEN], const void *message, size_t len, const u8 key[NOISE_SYMMETRIC_KEY_LEN]) +{ -+ struct blake2s_state state; + len = len - sizeof(struct message_macs) + offsetof(struct message_macs, mac1); -+ -+ if (psk) -+ blake2s_init_key(&state, COOKIE_LEN, psk, NOISE_SYMMETRIC_KEY_LEN); -+ else -+ blake2s_init(&state, COOKIE_LEN); -+ blake2s_update(&state, pubkey, NOISE_PUBLIC_KEY_LEN); -+ blake2s_update(&state, message, len); -+ blake2s_final(&state, mac1, COOKIE_LEN); ++ blake2s(mac1, message, key, COOKIE_LEN, len, NOISE_SYMMETRIC_KEY_LEN); +} + +static void compute_mac2(u8 mac2[COOKIE_LEN], const void *message, size_t len, const u8 cookie[COOKIE_LEN]) @@ -420,9 +444,9 @@ + down_read(&checker->secret_lock); + + blake2s_init_key(&state, COOKIE_LEN, checker->secret, NOISE_HASH_LEN); -+ if (ip_hdr(skb)->version == 4) ++ if (skb->protocol == htons(ETH_P_IP)) + blake2s_update(&state, (u8 *)&ip_hdr(skb)->saddr, sizeof(struct in_addr)); -+ else if (ip_hdr(skb)->version == 6) ++ else if (skb->protocol == htons(ETH_P_IPV6)) + blake2s_update(&state, (u8 *)&ipv6_hdr(skb)->saddr, sizeof(struct 
in6_addr)); + blake2s_update(&state, (u8 *)&udp_hdr(skb)->source, sizeof(__be16)); + blake2s_final(&state, cookie, COOKIE_LEN); @@ -430,21 +454,15 @@ + up_read(&checker->secret_lock); +} + -+enum cookie_mac_state cookie_validate_packet(struct cookie_checker *checker, struct sk_buff *skb, void *data_start, size_t data_len, bool check_cookie) ++enum cookie_mac_state cookie_validate_packet(struct cookie_checker *checker, struct sk_buff *skb, bool check_cookie) +{ + u8 computed_mac[COOKIE_LEN]; + u8 cookie[COOKIE_LEN]; + enum cookie_mac_state ret; -+ struct message_macs *macs = (struct message_macs *)((u8 *)data_start + data_len - sizeof(struct message_macs)); ++ struct message_macs *macs = (struct message_macs *)(skb->data + skb->len - sizeof(struct message_macs)); + + ret = INVALID_MAC; -+ down_read(&checker->device->static_identity.lock); -+ if (unlikely(!checker->device->static_identity.has_identity)) { -+ up_read(&checker->device->static_identity.lock); -+ goto out; -+ } -+ compute_mac1(computed_mac, data_start, data_len, checker->device->static_identity.static_public, checker->device->static_identity.has_psk ? checker->device->static_identity.preshared_key : NULL); -+ up_read(&checker->device->static_identity.lock); ++ compute_mac1(computed_mac, skb->data, skb->len, checker->message_mac1_key); + if (crypto_memneq(computed_mac, macs->mac1, COOKIE_LEN)) + goto out; + @@ -455,12 +473,12 @@ + + make_cookie(cookie, skb, checker); + -+ compute_mac2(computed_mac, data_start, data_len, cookie); ++ compute_mac2(computed_mac, skb->data, skb->len, cookie); + if (crypto_memneq(computed_mac, macs->mac2, COOKIE_LEN)) + goto out; + + ret = VALID_MAC_WITH_COOKIE_BUT_RATELIMITED; -+ if (!ratelimiter_allow(&checker->ratelimiter, skb)) ++ if (!ratelimiter_allow(skb, dev_net(netdev_pub(checker->device)))) + goto out; + + ret = VALID_MAC_WITH_COOKIE; @@ -473,16 +491,8 @@ +{ + struct message_macs *macs = (struct message_macs *)((u8 *)message + len - sizeof(struct message_macs)); + -+ down_read(&peer->device->static_identity.lock); -+ if (unlikely(!peer->device->static_identity.has_identity)) { -+ memset(macs, 0, sizeof(struct message_macs)); -+ up_read(&peer->device->static_identity.lock); -+ return; -+ } -+ compute_mac1(macs->mac1, message, len, peer->handshake.remote_static, peer->device->static_identity.has_psk ? 
peer->device->static_identity.preshared_key : NULL); -+ up_read(&peer->device->static_identity.lock); -+ + down_write(&peer->latest_cookie.lock); ++ compute_mac1(macs->mac1, message, len, peer->latest_cookie.message_mac1_key); + memcpy(peer->latest_cookie.last_mac1_sent, macs->mac1, COOKIE_LEN); + peer->latest_cookie.have_sent_mac1 = true; + up_write(&peer->latest_cookie.lock); @@ -495,15 +505,14 @@ + up_read(&peer->latest_cookie.lock); +} + -+void cookie_message_create(struct message_handshake_cookie *dst, struct sk_buff *skb, void *data_start, size_t data_len, __le32 index, struct cookie_checker *checker) ++void cookie_message_create(struct message_handshake_cookie *dst, struct sk_buff *skb, __le32 index, struct cookie_checker *checker) +{ -+ struct message_macs *macs = (struct message_macs *)((u8 *)data_start + data_len - sizeof(struct message_macs)); ++ struct message_macs *macs = (struct message_macs *)((u8 *)skb->data + skb->len - sizeof(struct message_macs)); + u8 cookie[COOKIE_LEN]; + + dst->header.type = cpu_to_le32(MESSAGE_HANDSHAKE_COOKIE); + dst->receiver_index = index; -+ get_random_bytes(dst->nonce, COOKIE_NONCE_LEN); -+ blake2s(dst->nonce, dst->nonce, NULL, COOKIE_NONCE_LEN, COOKIE_NONCE_LEN, 0); /* Avoid directly transmitting RNG output. */ ++ get_random_bytes_wait(dst->nonce, COOKIE_NONCE_LEN); + + make_cookie(cookie, skb, checker); + xchacha20poly1305_encrypt(dst->encrypted_cookie, cookie, COOKIE_LEN, macs->mac1, COOKIE_LEN, dst->nonce, checker->cookie_encryption_key); @@ -516,7 +525,7 @@ + bool ret; + + entry = index_hashtable_lookup(&wg->index_hashtable, INDEX_HASHTABLE_HANDSHAKE | INDEX_HASHTABLE_KEYPAIR, src->receiver_index); -+ if (!unlikely(entry)) ++ if (unlikely(!entry)) + return; + + down_read(&entry->peer->latest_cookie.lock); @@ -535,14 +544,14 @@ + entry->peer->latest_cookie.have_sent_mac1 = false; + up_write(&entry->peer->latest_cookie.lock); + } else -+ net_dbg_ratelimited("Could not decrypt invalid cookie response\n"); ++ net_dbg_ratelimited("%s: Could not decrypt invalid cookie response\n", netdev_pub(wg)->name); + +out: + peer_put(entry->peer); +} ---- /dev/null -+++ b/net/wireguard/data.c -@@ -0,0 +1,493 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/data.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,433 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. 
*/ + +#include "noise.h" @@ -560,40 +569,26 @@ +#include +#include + -+struct encryption_skb_cb { -+ u8 ds; -+ u8 num_frags; -+ unsigned int plaintext_len, trailer_len; -+ struct sk_buff *trailer; -+ u64 nonce; -+}; -+ +struct encryption_ctx { + struct padata_priv padata; + struct sk_buff_head queue; -+ packet_create_data_callback_t callback; + struct wireguard_peer *peer; + struct noise_keypair *keypair; +}; + +struct decryption_ctx { + struct padata_priv padata; -+ struct sk_buff *skb; -+ packet_consume_data_callback_t callback; -+ struct noise_keypair *keypair; + struct endpoint endpoint; -+ u64 nonce; -+ int ret; -+ u8 num_frags; ++ struct sk_buff *skb; ++ struct noise_keypair *keypair; +}; + +#ifdef CONFIG_WIREGUARD_PARALLEL +static struct kmem_cache *encryption_ctx_cache __read_mostly; +static struct kmem_cache *decryption_ctx_cache __read_mostly; + -+int packet_init_data_caches(void) ++int __init packet_init_data_caches(void) +{ -+ BUILD_BUG_ON(sizeof(struct encryption_skb_cb) > sizeof(((struct sk_buff *)0)->cb)); + encryption_ctx_cache = kmem_cache_create("wireguard_encryption_ctx", sizeof(struct encryption_ctx), 0, 0, NULL); + if (!encryption_ctx_cache) + return -ENOMEM; @@ -627,10 +622,10 @@ + if (unlikely((COUNTER_WINDOW_SIZE + their_counter) < counter->receive.counter)) + goto out; + -+ index = their_counter >> ilog2(COUNTER_REDUNDANT_BITS); ++ index = their_counter >> ilog2(BITS_PER_LONG); + + if (likely(their_counter > counter->receive.counter)) { -+ index_current = counter->receive.counter >> ilog2(COUNTER_REDUNDANT_BITS); ++ index_current = counter->receive.counter >> ilog2(BITS_PER_LONG); + top = min_t(unsigned long, index - index_current, COUNTER_BITS_TOTAL / BITS_PER_LONG); + for (i = 1; i <= top; ++i) + counter->receive.backtrack[(i + index_current) & ((COUNTER_BITS_TOTAL / BITS_PER_LONG) - 1)] = 0; @@ -638,7 +633,7 @@ + } + + index &= (COUNTER_BITS_TOTAL / BITS_PER_LONG) - 1; -+ ret = !test_and_set_bit(their_counter & (COUNTER_REDUNDANT_BITS - 1), &counter->receive.backtrack[index]); ++ ret = !test_and_set_bit(their_counter & (BITS_PER_LONG - 1), &counter->receive.backtrack[index]); + +out: + spin_unlock_bh(&counter->receive.lock); @@ -675,13 +670,36 @@ + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_probe_transport_header(skb, 0); ++ skb_reset_inner_headers(skb); +} + -+static inline void skb_encrypt(struct sk_buff *skb, struct noise_keypair *keypair, bool have_simd) ++static inline bool skb_encrypt(struct sk_buff *skb, struct noise_keypair *keypair, bool have_simd) +{ -+ struct encryption_skb_cb *cb = (struct encryption_skb_cb *)skb->cb; -+ struct scatterlist sg[cb->num_frags]; /* This should be bound to at most 128 by the caller. */ ++ struct scatterlist sg[MAX_SKB_FRAGS * 2 + 1]; + struct message_data *header; ++ unsigned int padding_len, plaintext_len, trailer_len; ++ int num_frags; ++ struct sk_buff *trailer; ++ ++ /* Store the ds bit in the cb */ ++ PACKET_CB(skb)->ds = ip_tunnel_ecn_encap(0 /* No outer TOS: no leak. TODO: should we use flowi->tos as outer? 
*/, ip_hdr(skb), skb); ++ ++ /* Calculate lengths */ ++ padding_len = skb_padding(skb); ++ trailer_len = padding_len + noise_encrypted_len(0); ++ plaintext_len = skb->len + padding_len; ++ ++ /* Expand data section to have room for padding and auth tag */ ++ num_frags = skb_cow_data(skb, trailer_len, &trailer); ++ if (unlikely(num_frags < 0 || num_frags > ARRAY_SIZE(sg))) ++ return false; ++ ++ /* Set the padding to zeros, and make sure it and the auth tag are part of the skb */ ++ memset(skb_tail_pointer(trailer), 0, padding_len); ++ ++ /* Expand head section to have room for our header and the network stack's headers. */ ++ if (unlikely(skb_cow_head(skb, DATA_PACKET_HEAD_ROOM) < 0)) ++ return false; + + /* We have to remember to add the checksum to the innerpacket, in case the receiver forwards it. */ + if (likely(!skb_checksum_setup(skb, true))) @@ -691,18 +709,21 @@ + header = (struct message_data *)skb_push(skb, sizeof(struct message_data)); + header->header.type = cpu_to_le32(MESSAGE_DATA); + header->key_idx = keypair->remote_index; -+ header->counter = cpu_to_le64(cb->nonce); -+ pskb_put(skb, cb->trailer, cb->trailer_len); ++ header->counter = cpu_to_le64(PACKET_CB(skb)->nonce); ++ pskb_put(skb, trailer, trailer_len); + + /* Now we can encrypt the scattergather segments */ -+ sg_init_table(sg, cb->num_frags); -+ skb_to_sgvec(skb, sg, sizeof(struct message_data), noise_encrypted_len(cb->plaintext_len)); -+ chacha20poly1305_encrypt_sg(sg, sg, cb->plaintext_len, NULL, 0, cb->nonce, keypair->sending.key, have_simd); ++ sg_init_table(sg, num_frags); ++ if (skb_to_sgvec(skb, sg, sizeof(struct message_data), noise_encrypted_len(plaintext_len)) <= 0) ++ return false; ++ return chacha20poly1305_encrypt_sg(sg, sg, plaintext_len, NULL, 0, PACKET_CB(skb)->nonce, keypair->sending.key, have_simd); +} + -+static inline bool skb_decrypt(struct sk_buff *skb, u8 num_frags, u64 nonce, struct noise_symmetric_key *key) ++static inline bool skb_decrypt(struct sk_buff *skb, struct noise_symmetric_key *key) +{ -+ struct scatterlist sg[num_frags]; /* This should be bound to at most 128 by the caller. 
*/ ++ struct scatterlist sg[MAX_SKB_FRAGS * 2 + 1]; ++ struct sk_buff *trailer; ++ int num_frags; + + if (unlikely(!key)) + return false; @@ -712,13 +733,20 @@ + return false; + } + -+ sg_init_table(sg, num_frags); -+ skb_to_sgvec(skb, sg, 0, skb->len); -+ -+ if (!chacha20poly1305_decrypt_sg(sg, sg, skb->len, NULL, 0, nonce, key->key)) ++ PACKET_CB(skb)->nonce = le64_to_cpu(((struct message_data *)skb->data)->counter); ++ skb_pull(skb, sizeof(struct message_data)); ++ num_frags = skb_cow_data(skb, 0, &trailer); ++ if (unlikely(num_frags < 0 || num_frags > ARRAY_SIZE(sg))) + return false; + -+ return pskb_trim(skb, skb->len - noise_encrypted_len(0)) == 0; ++ sg_init_table(sg, num_frags); ++ if (skb_to_sgvec(skb, sg, 0, skb->len) <= 0) ++ return false; ++ ++ if (!chacha20poly1305_decrypt_sg(sg, sg, skb->len, NULL, 0, PACKET_CB(skb)->nonce, key->key)) ++ return false; ++ ++ return !pskb_trim(skb, skb->len - noise_encrypted_len(0)); +} + +static inline bool get_encryption_nonce(u64 *nonce, struct noise_symmetric_key *key) @@ -742,10 +770,14 @@ + +static inline void queue_encrypt_reset(struct sk_buff_head *queue, struct noise_keypair *keypair) +{ -+ struct sk_buff *skb; ++ struct sk_buff *skb, *tmp; + bool have_simd = chacha20poly1305_init_simd(); -+ skb_queue_walk(queue, skb) { -+ skb_encrypt(skb, keypair, have_simd); ++ skb_queue_walk_safe (queue, skb, tmp) { ++ if (unlikely(!skb_encrypt(skb, keypair, have_simd))) { ++ __skb_unlink(skb, queue); ++ kfree_skb(skb); ++ continue; ++ } + skb_reset(skb); + } + chacha20poly1305_deinit_simd(have_simd); @@ -753,32 +785,28 @@ +} + +#ifdef CONFIG_WIREGUARD_PARALLEL -+static void do_encryption(struct padata_priv *padata) ++static void begin_parallel_encryption(struct padata_priv *padata) +{ + struct encryption_ctx *ctx = container_of(padata, struct encryption_ctx, padata); -+ ++#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && defined(CONFIG_ARM) ++ local_bh_enable(); ++#endif + queue_encrypt_reset(&ctx->queue, ctx->keypair); ++#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && defined(CONFIG_ARM) ++ local_bh_disable(); ++#endif + padata_do_serial(padata); +} + -+static void finish_encryption(struct padata_priv *padata) ++static void finish_parallel_encryption(struct padata_priv *padata) +{ + struct encryption_ctx *ctx = container_of(padata, struct encryption_ctx, padata); -+ -+ ctx->callback(&ctx->queue, ctx->peer); ++ packet_create_data_done(&ctx->queue, ctx->peer); + atomic_dec(&ctx->peer->parallel_encryption_inflight); + peer_put(ctx->peer); + kmem_cache_free(encryption_ctx_cache, ctx); +} + -+static inline int start_encryption(struct padata_instance *padata, struct padata_priv *priv, int cb_cpu) -+{ -+ memset(priv, 0, sizeof(struct padata_priv)); -+ priv->parallel = do_encryption; -+ priv->serial = finish_encryption; -+ return padata_do_parallel(padata, priv, cb_cpu); -+} -+ +static inline unsigned int choose_cpu(__le32 key) +{ + unsigned int cpu_index, cpu, cb_cpu; @@ -793,48 +821,20 @@ +} +#endif + -+int packet_create_data(struct sk_buff_head *queue, struct wireguard_peer *peer, packet_create_data_callback_t callback) ++int packet_create_data(struct sk_buff_head *queue, struct wireguard_peer *peer) +{ + int ret = -ENOKEY; + struct noise_keypair *keypair; + struct sk_buff *skb; + -+ rcu_read_lock(); -+ keypair = noise_keypair_get(rcu_dereference(peer->keypairs.current_keypair)); ++ rcu_read_lock_bh(); ++ keypair = noise_keypair_get(rcu_dereference_bh(peer->keypairs.current_keypair)); + if (unlikely(!keypair)) + goto err_rcu; -+ rcu_read_unlock(); ++ 
rcu_read_unlock_bh();
 +
-+ skb_queue_walk(queue, skb) {
-+ struct encryption_skb_cb *cb = (struct encryption_skb_cb *)skb->cb;
-+ unsigned int padding_len, num_frags;
-+
-+ if (unlikely(!get_encryption_nonce(&cb->nonce, &keypair->sending)))
-+ goto err;
-+
-+ padding_len = skb_padding(skb);
-+ cb->trailer_len = padding_len + noise_encrypted_len(0);
-+ cb->plaintext_len = skb->len + padding_len;
-+
-+ /* Store the ds bit in the cb */
-+ cb->ds = ip_tunnel_ecn_encap(0 /* No outer TOS: no leak. TODO: should we use flowi->tos as outer? */, ip_hdr(skb), skb);
-+
-+ /* Expand data section to have room for padding and auth tag */
-+ ret = skb_cow_data(skb, cb->trailer_len, &cb->trailer);
-+ if (unlikely(ret < 0))
-+ goto err;
-+ num_frags = ret;
-+ ret = -ENOMEM;
-+ if (unlikely(num_frags > 128))
-+ goto err;
-+ cb->num_frags = num_frags;
-+
-+ /* Set the padding to zeros, and make sure it and the auth tag are part of the skb */
-+ memset(skb_tail_pointer(cb->trailer), 0, padding_len);
-+
-+ /* Expand head section to have room for our header and the network stack's headers. */
-+ ret = skb_cow_head(skb, DATA_PACKET_HEAD_ROOM);
-+ if (unlikely(ret < 0))
++ skb_queue_walk (queue, skb) {
++ if (unlikely(!get_encryption_nonce(&PACKET_CB(skb)->nonce, &keypair->sending)))
 + goto err;
 +
 + /* After the first time through the loop, if we've succeeded with a legitimate nonce,
 +
 +#ifdef CONFIG_WIREGUARD_PARALLEL
 + if ((skb_queue_len(queue) > 1 || queue->next->len > 256 || atomic_read(&peer->parallel_encryption_inflight) > 0) && cpumask_weight(cpu_online_mask) > 1) {
-+ unsigned int cpu = choose_cpu(keypair->remote_index);
 + struct encryption_ctx *ctx = kmem_cache_alloc(encryption_ctx_cache, GFP_ATOMIC);
 + if (!ctx)
 + goto serial_encrypt;
 + skb_queue_head_init(&ctx->queue);
 + skb_queue_splice_init(queue, &ctx->queue);
-+ ctx->callback = callback;
++ memset(&ctx->padata, 0, sizeof(ctx->padata));
++ ctx->padata.parallel = begin_parallel_encryption;
++ ctx->padata.serial = finish_parallel_encryption;
 + ctx->keypair = keypair;
 + ctx->peer = peer_rcu_get(peer);
 + ret = -EBUSY;
 + if (unlikely(!ctx->peer))
 + goto err_parallel;
 + atomic_inc(&peer->parallel_encryption_inflight);
-+ ret = start_encryption(peer->device->parallel_send, &ctx->padata, cpu);
-+ if (unlikely(ret < 0)) {
++ if (unlikely(padata_do_parallel(peer->device->encrypt_pd, &ctx->padata, choose_cpu(keypair->remote_index)))) {
 + atomic_dec(&peer->parallel_encryption_inflight);
 + peer_put(ctx->peer);
 +err_parallel:
 + skb_queue_splice(&ctx->queue, queue);
 + kmem_cache_free(encryption_ctx_cache, ctx);
 + goto err;
 + }
 + } else
 +serial_encrypt:
 +#endif
 + {
 + queue_encrypt_reset(queue, keypair);
-+ callback(queue, peer);
++ packet_create_data_done(queue, peer);
 + }
 + return 0;
 +
 +err:
 + noise_keypair_put(keypair);
 + return ret;
 +err_rcu:
-+ rcu_read_unlock();
++ rcu_read_unlock_bh();
 + return ret;
 +}
 +
 +static void begin_decrypt_packet(struct decryption_ctx *ctx)
 +{
-+ if (unlikely(!skb_decrypt(ctx->skb, ctx->num_frags, ctx->nonce, &ctx->keypair->receiving)))
-+ goto err;
-+
-+ skb_reset(ctx->skb);
-+ ctx->ret = 0;
-+ return;
-+
-+err:
-+ ctx->ret = -ENOKEY;
-+ peer_put(ctx->keypair->entry.peer);
++ if (unlikely(socket_endpoint_from_skb(&ctx->endpoint, ctx->skb) < 0 || !skb_decrypt(ctx->skb, &ctx->keypair->receiving))) {
++ peer_put(ctx->keypair->entry.peer);
++ noise_keypair_put(ctx->keypair);
++ dev_kfree_skb(ctx->skb);
++ ctx->skb = NULL;
++ }
 +}
 +
 +static void finish_decrypt_packet(struct decryption_ctx *ctx)
 +{
-+ struct noise_keypairs *keypairs;
-+ bool used_new_key = false;
-+ int ret = ctx->ret;
-+ if (ret)
-+ goto 
err; ++ bool used_new_key; + -+ keypairs = &ctx->keypair->entry.peer->keypairs; -+ ret = counter_validate(&ctx->keypair->receiving.counter, ctx->nonce) ? 0 : -ERANGE; ++ if (!ctx->skb) ++ return; + -+ if (likely(!ret)) -+ used_new_key = noise_received_with_keypair(&ctx->keypair->entry.peer->keypairs, ctx->keypair); -+ else { -+ net_dbg_ratelimited("Packet has invalid nonce %Lu (max %Lu)\n", ctx->nonce, ctx->keypair->receiving.counter.receive.counter); ++ if (unlikely(!counter_validate(&ctx->keypair->receiving.counter, PACKET_CB(ctx->skb)->nonce))) { ++ net_dbg_ratelimited("%s: Packet has invalid nonce %Lu (max %Lu)\n", netdev_pub(ctx->keypair->entry.peer->device)->name, PACKET_CB(ctx->skb)->nonce, ctx->keypair->receiving.counter.receive.counter); + peer_put(ctx->keypair->entry.peer); -+ goto err; ++ noise_keypair_put(ctx->keypair); ++ dev_kfree_skb(ctx->skb); ++ return; + } + ++ used_new_key = noise_received_with_keypair(&ctx->keypair->entry.peer->keypairs, ctx->keypair); ++ skb_reset(ctx->skb); ++ packet_consume_data_done(ctx->skb, ctx->keypair->entry.peer, &ctx->endpoint, used_new_key); + noise_keypair_put(ctx->keypair); -+ ctx->callback(ctx->skb, ctx->keypair->entry.peer, &ctx->endpoint, used_new_key, 0); -+ return; -+ -+err: -+ noise_keypair_put(ctx->keypair); -+ ctx->callback(ctx->skb, NULL, NULL, false, ret); +} + +#ifdef CONFIG_WIREGUARD_PARALLEL -+static void do_decryption(struct padata_priv *padata) ++static void begin_parallel_decryption(struct padata_priv *padata) +{ + struct decryption_ctx *ctx = container_of(padata, struct decryption_ctx, padata); ++#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && defined(CONFIG_ARM) ++ local_bh_enable(); ++#endif + begin_decrypt_packet(ctx); ++#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && defined(CONFIG_ARM) ++ local_bh_disable(); ++#endif + padata_do_serial(padata); +} + -+static void finish_decryption(struct padata_priv *padata) ++static void finish_parallel_decryption(struct padata_priv *padata) +{ + struct decryption_ctx *ctx = container_of(padata, struct decryption_ctx, padata); + finish_decrypt_packet(ctx); + kmem_cache_free(decryption_ctx_cache, ctx); +} -+ -+static inline int start_decryption(struct padata_instance *padata, struct padata_priv *priv, int cb_cpu) -+{ -+ priv->parallel = do_decryption; -+ priv->serial = finish_decryption; -+ return padata_do_parallel(padata, priv, cb_cpu); -+} +#endif + -+void packet_consume_data(struct sk_buff *skb, size_t offset, struct wireguard_device *wg, packet_consume_data_callback_t callback) ++void packet_consume_data(struct sk_buff *skb, struct wireguard_device *wg) +{ -+ int ret; -+ struct endpoint endpoint; -+ unsigned int num_frags; -+ struct sk_buff *trailer; -+ struct message_data *header; + struct noise_keypair *keypair; -+ u64 nonce; -+ __le32 idx; ++ __le32 idx = ((struct message_data *)skb->data)->key_idx; + -+ ret = socket_endpoint_from_skb(&endpoint, skb); -+ if (unlikely(ret < 0)) -+ goto err; -+ -+ ret = -ENOMEM; -+ if (unlikely(!pskb_may_pull(skb, offset + sizeof(struct message_data)))) -+ goto err; -+ -+ header = (struct message_data *)(skb->data + offset); -+ offset += sizeof(struct message_data); -+ skb_pull(skb, offset); -+ -+ idx = header->key_idx; -+ nonce = le64_to_cpu(header->counter); -+ -+ ret = skb_cow_data(skb, 0, &trailer); -+ if (unlikely(ret < 0)) -+ goto err; -+ num_frags = ret; -+ ret = -ENOMEM; -+ if (unlikely(num_frags > 128)) -+ goto err; -+ ret = -EINVAL; -+ rcu_read_lock(); ++ rcu_read_lock_bh(); + keypair = noise_keypair_get((struct noise_keypair 
*)index_hashtable_lookup(&wg->index_hashtable, INDEX_HASHTABLE_KEYPAIR, idx)); -+ rcu_read_unlock(); ++ rcu_read_unlock_bh(); + if (unlikely(!keypair)) + goto err; ++ +#ifdef CONFIG_WIREGUARD_PARALLEL + if (cpumask_weight(cpu_online_mask) > 1) { -+ unsigned int cpu = choose_cpu(idx); -+ struct decryption_ctx *ctx; -+ -+ ret = -ENOMEM; -+ ctx = kmem_cache_alloc(decryption_ctx_cache, GFP_ATOMIC); ++ struct decryption_ctx *ctx = kmem_cache_alloc(decryption_ctx_cache, GFP_ATOMIC); + if (unlikely(!ctx)) + goto err_peer; -+ + ctx->skb = skb; + ctx->keypair = keypair; -+ ctx->callback = callback; -+ ctx->nonce = nonce; -+ ctx->num_frags = num_frags; -+ ctx->endpoint = endpoint; -+ ret = start_decryption(wg->parallel_receive, &ctx->padata, cpu); -+ if (unlikely(ret)) { ++ memset(&ctx->padata, 0, sizeof(ctx->padata)); ++ ctx->padata.parallel = begin_parallel_decryption; ++ ctx->padata.serial = finish_parallel_decryption; ++ if (unlikely(padata_do_parallel(wg->decrypt_pd, &ctx->padata, choose_cpu(idx)))) { + kmem_cache_free(decryption_ctx_cache, ctx); + goto err_peer; + } @@ -1017,11 +970,7 @@ + { + struct decryption_ctx ctx = { + .skb = skb, -+ .keypair = keypair, -+ .callback = callback, -+ .nonce = nonce, -+ .num_frags = num_frags, -+ .endpoint = endpoint ++ .keypair = keypair + }; + begin_decrypt_packet(&ctx); + finish_decrypt_packet(&ctx); @@ -1034,11 +983,11 @@ + noise_keypair_put(keypair); +#endif +err: -+ callback(skb, NULL, NULL, false, ret); ++ dev_kfree_skb(skb); +} ---- /dev/null -+++ b/net/wireguard/device.c -@@ -0,0 +1,387 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/device.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,392 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ + +#include "packets.h" @@ -1046,6 +995,7 @@ +#include "timers.h" +#include "device.h" +#include "config.h" ++#include "ratelimiter.h" +#include "peer.h" +#include "uapi.h" +#include "messages.h" @@ -1062,25 +1012,17 @@ +#include +#include +#include -+#if IS_ENABLED(CONFIG_NF_CONNTRACK) -+#include -+#include -+#endif + -+static int open_peer(struct wireguard_peer *peer, void *data) -+{ -+ timers_init_peer(peer); -+ packet_send_queue(peer); -+ if (peer->persistent_keepalive_interval) -+ packet_send_keepalive(peer); -+ return 0; -+} ++static LIST_HEAD(device_list); + +static int open(struct net_device *dev) +{ + int ret; ++ struct wireguard_peer *peer, *temp; + struct wireguard_device *wg = netdev_priv(dev); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) + struct inet6_dev *dev_v6 = __in6_dev_get(dev); ++#endif + struct in_device *dev_v4 = __in_dev_get_rtnl(dev); + + if (dev_v4) { @@ -1091,102 +1033,92 @@ + IN_DEV_CONF_SET(dev_v4, SEND_REDIRECTS, false); + IPV4_DEVCONF_ALL(dev_net(dev), SEND_REDIRECTS) = false; + } ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) + if (dev_v6) +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0) + dev_v6->addr_gen_mode = IN6_ADDR_GEN_MODE_NONE; +#else + dev_v6->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_NONE; +#endif ++#endif + + ret = socket_init(wg); + if (ret < 0) + return ret; -+ peer_for_each(wg, open_peer, NULL); -+ return 0; -+} -+ -+static int clear_noise_peer(struct wireguard_peer *peer, void *data) -+{ -+ noise_handshake_clear(&peer->handshake); -+ noise_keypairs_clear(&peer->keypairs); -+ if (peer->timers_enabled) -+ del_timer(&peer->timer_kill_ephemerals); ++ peer_for_each (wg, peer, temp, true) { ++ timers_init_peer(peer); ++ packet_send_queue(peer); ++ if (peer->persistent_keepalive_interval) ++ 
packet_send_keepalive(peer); ++ } + return 0; +} + +#ifdef CONFIG_PM_SLEEP +static int suspending_clear_noise_peers(struct notifier_block *nb, unsigned long action, void *data) +{ -+ struct wireguard_device *wg = container_of(nb, struct wireguard_device, clear_peers_on_suspend); -+ if (action == PM_HIBERNATION_PREPARE || action == PM_SUSPEND_PREPARE) { -+ peer_for_each(wg, clear_noise_peer, NULL); -+ rcu_barrier(); -+ } -+ return 0; -+} -+#endif ++ struct wireguard_device *wg; ++ struct wireguard_peer *peer, *temp; ++ ++ if (action != PM_HIBERNATION_PREPARE && action != PM_SUSPEND_PREPARE) ++ return 0; ++ ++ rtnl_lock(); ++ list_for_each_entry (wg, &device_list, device_list) { ++ peer_for_each (wg, peer, temp, true) { ++ noise_handshake_clear(&peer->handshake); ++ noise_keypairs_clear(&peer->keypairs); ++ if (peer->timers_enabled) ++ del_timer(&peer->timer_kill_ephemerals); ++ } ++ } ++ rtnl_unlock(); ++ rcu_barrier_bh(); + -+static int stop_peer(struct wireguard_peer *peer, void *data) -+{ -+ timers_uninit_peer(peer); -+ clear_noise_peer(peer, data); + return 0; +} ++static struct notifier_block clear_peers_on_suspend = { .notifier_call = suspending_clear_noise_peers }; ++#endif + +static int stop(struct net_device *dev) +{ + struct wireguard_device *wg = netdev_priv(dev); -+ peer_for_each(wg, stop_peer, NULL); ++ struct wireguard_peer *peer, *temp; ++ peer_for_each (wg, peer, temp, true) { ++ timers_uninit_peer(peer); ++ noise_handshake_clear(&peer->handshake); ++ noise_keypairs_clear(&peer->keypairs); ++ if (peer->timers_enabled) ++ del_timer(&peer->timer_kill_ephemerals); ++ } + skb_queue_purge(&wg->incoming_handshakes); + socket_uninit(wg); + return 0; +} + -+static void skb_unsendable(struct sk_buff *skb, struct net_device *dev) -+{ -+#if IS_ENABLED(CONFIG_NF_CONNTRACK) -+ /* This conntrack stuff is because the rate limiting needs to be applied -+ * to the original src IP, so we have to restore saddr in the IP header. -+ * It's not needed if conntracking isn't in the kernel, because in that -+ * case the saddr wouldn't be NAT-transformed anyway. 
*/ -+ enum ip_conntrack_info ctinfo; -+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo); -+#endif -+ ++dev->stats.tx_errors; -+ -+ if (skb->len >= sizeof(struct iphdr) && ip_hdr(skb)->version == 4) { -+#if IS_ENABLED(CONFIG_NF_CONNTRACK) -+ if (ct) -+ ip_hdr(skb)->saddr = ct->tuplehash[0].tuple.src.u3.ip; -+#endif -+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); -+ } else if (skb->len >= sizeof(struct ipv6hdr) && ip_hdr(skb)->version == 6) { -+#if IS_ENABLED(CONFIG_NF_CONNTRACK) -+ if (ct) -+ ipv6_hdr(skb)->saddr = ct->tuplehash[0].tuple.src.u3.in6; -+#endif -+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); -+ } -+ kfree_skb(skb); -+} -+ +static netdev_tx_t xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct wireguard_device *wg = netdev_priv(dev); + struct wireguard_peer *peer; ++ struct sk_buff *next; + int ret; + + if (unlikely(dev_recursion_level() > 4)) { + ret = -ELOOP; -+ net_dbg_ratelimited("Routing loop detected\n"); -+ skb_unsendable(skb, dev); ++ net_dbg_ratelimited("%s: Routing loop detected\n", dev->name); ++ goto err; ++ } ++ ++ if (unlikely(skb_examine_untrusted_ip_hdr(skb) != skb->protocol)) { ++ ret = -EPROTONOSUPPORT; ++ net_dbg_ratelimited("%s: Invalid IP packet\n", dev->name); + goto err; + } + + peer = routing_table_lookup_dst(&wg->peer_routing_table, skb); + if (unlikely(!peer)) { + ret = -ENOKEY; -+ net_dbg_skb_ratelimited("No peer is configured for %pISc\n", skb); ++ net_dbg_skb_ratelimited("%s: No peer is configured for %pISc\n", dev->name, skb); + goto err; + } + @@ -1194,8 +1126,8 @@ + ret = peer->endpoint.addr.sa_family != AF_INET && peer->endpoint.addr.sa_family != AF_INET6; + read_unlock_bh(&peer->endpoint_lock); + if (unlikely(ret)) { -+ ret = -EHOSTUNREACH; -+ net_dbg_ratelimited("No valid endpoint has been configured or discovered for peer %Lu\n", peer->internal_id); ++ ret = -EDESTADDRREQ; ++ net_dbg_ratelimited("%s: No valid endpoint has been configured or discovered for peer %Lu\n", dev->name, peer->internal_id); + goto err_peer; + } + @@ -1215,8 +1147,8 @@ + dev_kfree_skb(skb); + skb = segs; + } -+ while (skb) { -+ struct sk_buff *next = skb->next; ++ do { ++ next = skb->next; + skb->next = skb->prev = NULL; + + skb = skb_share_check(skb, GFP_ATOMIC); @@ -1228,8 +1160,7 @@ + skb_dst_drop(skb); + + skb_queue_tail(&peer->tx_packet_queue, skb); -+ skb = next; -+ } ++ } while ((skb = next) != NULL); + + packet_send_queue(peer); + peer_put(peer); @@ -1238,11 +1169,15 @@ +err_peer: + peer_put(peer); +err: -+ skb_unsendable(skb, dev); ++ ++dev->stats.tx_errors; ++ if (skb->protocol == htons(ETH_P_IP)) ++ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); ++ else if (skb->protocol == htons(ETH_P_IPV6)) ++ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); ++ kfree_skb(skb); + return ret; +} + -+ +static int ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct wireguard_device *wg = netdev_priv(dev); @@ -1271,29 +1206,30 @@ +{ + struct wireguard_device *wg = netdev_priv(dev); + ++ rtnl_lock(); ++ list_del(&wg->device_list); ++ rtnl_unlock(); + mutex_lock(&wg->device_update_lock); + peer_remove_all(wg); + wg->incoming_port = 0; -+ destroy_workqueue(wg->workqueue); ++ destroy_workqueue(wg->incoming_handshake_wq); ++ destroy_workqueue(wg->peer_wq); +#ifdef CONFIG_WIREGUARD_PARALLEL -+ padata_free(wg->parallel_send); -+ padata_free(wg->parallel_receive); -+ destroy_workqueue(wg->parallelqueue); ++ padata_free(wg->encrypt_pd); ++ padata_free(wg->decrypt_pd); ++ destroy_workqueue(wg->crypt_wq); 
+#endif + routing_table_free(&wg->peer_routing_table); ++ ratelimiter_uninit(); + memzero_explicit(&wg->static_identity, sizeof(struct noise_static_identity)); + skb_queue_purge(&wg->incoming_handshakes); + socket_uninit(wg); -+ cookie_checker_uninit(&wg->cookie_checker); -+#ifdef CONFIG_PM_SLEEP -+ unregister_pm_notifier(&wg->clear_peers_on_suspend); -+#endif + mutex_unlock(&wg->device_update_lock); + free_percpu(dev->tstats); -+ ++ free_percpu(wg->incoming_handshakes_worker); + put_net(wg->creating_net); + -+ pr_debug("Device %s has been deleted\n", dev->name); ++ pr_debug("%s: Interface deleted\n", dev->name); + free_netdev(dev); +} + @@ -1303,7 +1239,6 @@ + enum { WG_NETDEV_FEATURES = NETIF_F_HW_CSUM | NETIF_F_RXCSUM | NETIF_F_SG | NETIF_F_GSO | NETIF_F_GSO_SOFTWARE | NETIF_F_HIGHDMA }; + + dev->netdev_ops = &netdev_ops; -+ dev->destructor = destruct; + dev->hard_header_len = 0; + dev->addr_len = 0; + dev->needed_headroom = DATA_PACKET_HEAD_ROOM; @@ -1329,7 +1264,7 @@ + +static int newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) +{ -+ int ret = -ENOMEM; ++ int ret = -ENOMEM, cpu; + struct wireguard_device *wg = netdev_priv(dev); + + wg->creating_net = get_net(src_net); @@ -1337,71 +1272,82 @@ + mutex_init(&wg->socket_update_lock); + mutex_init(&wg->device_update_lock); + skb_queue_head_init(&wg->incoming_handshakes); -+ INIT_WORK(&wg->incoming_handshakes_work, packet_process_queued_handshake_packets); + pubkey_hashtable_init(&wg->peer_hashtable); + index_hashtable_init(&wg->index_hashtable); + routing_table_init(&wg->peer_routing_table); ++ cookie_checker_init(&wg->cookie_checker, wg); + INIT_LIST_HEAD(&wg->peer_list); + + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + goto error_1; + -+ wg->workqueue = alloc_workqueue("wg-%s", WQ_UNBOUND | WQ_FREEZABLE, 0, dev->name); -+ if (!wg->workqueue) ++ wg->incoming_handshakes_worker = alloc_percpu(struct handshake_worker); ++ if (!wg->incoming_handshakes_worker) + goto error_2; ++ for_each_possible_cpu (cpu) { ++ per_cpu_ptr(wg->incoming_handshakes_worker, cpu)->wg = wg; ++ INIT_WORK(&per_cpu_ptr(wg->incoming_handshakes_worker, cpu)->work, packet_process_queued_handshake_packets); ++ } ++ atomic_set(&wg->incoming_handshake_seqnr, 0); + -+#ifdef CONFIG_WIREGUARD_PARALLEL -+ wg->parallelqueue = alloc_workqueue("wg-crypt-%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1, dev->name); -+ if (!wg->parallelqueue) ++ wg->incoming_handshake_wq = alloc_workqueue("wg-kex-%s", WQ_CPU_INTENSIVE | WQ_FREEZABLE, 0, dev->name); ++ if (!wg->incoming_handshake_wq) + goto error_3; + -+ wg->parallel_send = padata_alloc_possible(wg->parallelqueue); -+ if (!wg->parallel_send) ++ wg->peer_wq = alloc_workqueue("wg-kex-%s", WQ_UNBOUND | WQ_FREEZABLE, 0, dev->name); ++ if (!wg->peer_wq) + goto error_4; -+ padata_start(wg->parallel_send); + -+ wg->parallel_receive = padata_alloc_possible(wg->parallelqueue); -+ if (!wg->parallel_receive) ++#ifdef CONFIG_WIREGUARD_PARALLEL ++ wg->crypt_wq = alloc_workqueue("wg-crypt-%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 2, dev->name); ++ if (!wg->crypt_wq) + goto error_5; -+ padata_start(wg->parallel_receive); -+#endif + -+ ret = cookie_checker_init(&wg->cookie_checker, wg); -+ if (ret < 0) ++ wg->encrypt_pd = padata_alloc_possible(wg->crypt_wq); ++ if (!wg->encrypt_pd) + goto error_6; ++ padata_start(wg->encrypt_pd); + -+#ifdef CONFIG_PM_SLEEP -+ wg->clear_peers_on_suspend.notifier_call = suspending_clear_noise_peers; -+ ret = 
register_pm_notifier(&wg->clear_peers_on_suspend); -+ if (ret < 0) ++ wg->decrypt_pd = padata_alloc_possible(wg->crypt_wq); ++ if (!wg->decrypt_pd) + goto error_7; ++ padata_start(wg->decrypt_pd); +#endif + -+ ret = register_netdevice(dev); ++ ret = ratelimiter_init(); + if (ret < 0) + goto error_8; + -+ pr_debug("Device %s has been created\n", dev->name); ++ ret = register_netdevice(dev); ++ if (ret < 0) ++ goto error_9; + -+ return 0; ++ list_add(&wg->device_list, &device_list); + ++ /* We wait until the end to assign priv_destructor, so that register_netdevice doesn't ++ * call it for us if it fails. */ ++ dev->priv_destructor = destruct; ++ ++ pr_debug("%s: Interface created\n", dev->name); ++ return ret; ++ ++error_9: ++ ratelimiter_uninit(); +error_8: -+#ifdef CONFIG_PM_SLEEP -+ unregister_pm_notifier(&wg->clear_peers_on_suspend); -+error_7: -+#endif -+ cookie_checker_uninit(&wg->cookie_checker); -+error_6: +#ifdef CONFIG_WIREGUARD_PARALLEL -+ padata_free(wg->parallel_receive); ++ padata_free(wg->decrypt_pd); ++error_7: ++ padata_free(wg->encrypt_pd); ++error_6: ++ destroy_workqueue(wg->crypt_wq); +error_5: -+ padata_free(wg->parallel_send); -+error_4: -+ destroy_workqueue(wg->parallelqueue); -+error_3: +#endif -+ destroy_workqueue(wg->workqueue); ++ destroy_workqueue(wg->peer_wq); ++error_4: ++ destroy_workqueue(wg->incoming_handshake_wq); ++error_3: ++ free_percpu(wg->incoming_handshakes_worker); +error_2: + free_percpu(dev->tstats); +error_1: @@ -1416,19 +1362,27 @@ + .newlink = newlink, +}; + -+int device_init(void) ++int __init device_init(void) +{ ++#ifdef CONFIG_PM_SLEEP ++ int ret = register_pm_notifier(&clear_peers_on_suspend); ++ if (ret) ++ return ret; ++#endif + return rtnl_link_register(&link_ops); +} + -+void device_uninit(void) ++void __exit device_uninit(void) +{ + rtnl_link_unregister(&link_ops); -+ rcu_barrier(); ++#ifdef CONFIG_PM_SLEEP ++ unregister_pm_notifier(&clear_peers_on_suspend); ++#endif ++ rcu_barrier_bh(); +} ---- /dev/null -+++ b/net/wireguard/hashtables.c -@@ -0,0 +1,136 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/hashtables.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,137 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. 
*/ + +#include "hashtables.h" @@ -1467,15 +1421,15 @@ +struct wireguard_peer *pubkey_hashtable_lookup(struct pubkey_hashtable *table, const u8 pubkey[NOISE_PUBLIC_KEY_LEN]) +{ + struct wireguard_peer *iter_peer, *peer = NULL; -+ rcu_read_lock(); -+ hlist_for_each_entry_rcu(iter_peer, pubkey_bucket(table, pubkey), pubkey_hash) { ++ rcu_read_lock_bh(); ++ hlist_for_each_entry_rcu_bh (iter_peer, pubkey_bucket(table, pubkey), pubkey_hash) { + if (!memcmp(pubkey, iter_peer->handshake.remote_static, NOISE_PUBLIC_KEY_LEN)) { + peer = iter_peer; + break; + } + } + peer = peer_get(peer); -+ rcu_read_unlock(); ++ rcu_read_unlock_bh(); + return peer; +} + @@ -1488,7 +1442,6 @@ + +void index_hashtable_init(struct index_hashtable *table) +{ -+ get_random_bytes(&table->key, sizeof(table->key)); + hash_init(table->hashtable); + spin_lock_init(&table->lock); +} @@ -1496,62 +1449,64 @@ +__le32 index_hashtable_insert(struct index_hashtable *table, struct index_hashtable_entry *entry) +{ + struct index_hashtable_entry *existing_entry; -+ u32 counter = get_random_int(); + -+ spin_lock(&table->lock); ++ spin_lock_bh(&table->lock); + hlist_del_init_rcu(&entry->index_hash); -+ spin_unlock(&table->lock); ++ spin_unlock_bh(&table->lock); + -+ rcu_read_lock(); ++ rcu_read_lock_bh(); + +search_unused_slot: + /* First we try to find an unused slot, randomly, while unlocked. */ -+ entry->index = (__force __le32)siphash_2u32(get_random_int(), counter++, &table->key); -+ hlist_for_each_entry_rcu(existing_entry, index_bucket(table, entry->index), index_hash) { ++ entry->index = (__force __le32)get_random_u32(); ++ hlist_for_each_entry_rcu_bh (existing_entry, index_bucket(table, entry->index), index_hash) { + if (existing_entry->index == entry->index) + goto search_unused_slot; /* If it's already in use, we continue searching. */ + } + + /* Once we've found an unused slot, we lock it, and then double-check + * that nobody else stole it from us. */ -+ spin_lock(&table->lock); -+ hlist_for_each_entry_rcu(existing_entry, index_bucket(table, entry->index), index_hash) { ++ spin_lock_bh(&table->lock); ++ hlist_for_each_entry_rcu_bh (existing_entry, index_bucket(table, entry->index), index_hash) { + if (existing_entry->index == entry->index) { -+ spin_unlock(&table->lock); ++ spin_unlock_bh(&table->lock); + goto search_unused_slot; /* If it was stolen, we start over. */ + } + } + /* Otherwise, we know we have it exclusively (since we're locked), so we insert. 
*/ + hlist_add_head_rcu(&entry->index_hash, index_bucket(table, entry->index)); -+ spin_unlock(&table->lock); ++ spin_unlock_bh(&table->lock); + -+ rcu_read_unlock(); ++ rcu_read_unlock_bh(); + + return entry->index; +} + -+void index_hashtable_replace(struct index_hashtable *table, struct index_hashtable_entry *old, struct index_hashtable_entry *new) ++bool index_hashtable_replace(struct index_hashtable *table, struct index_hashtable_entry *old, struct index_hashtable_entry *new) +{ -+ spin_lock(&table->lock); ++ if (unlikely(hlist_unhashed(&old->index_hash))) ++ return false; ++ spin_lock_bh(&table->lock); + new->index = old->index; + hlist_replace_rcu(&old->index_hash, &new->index_hash); + INIT_HLIST_NODE(&old->index_hash); -+ spin_unlock(&table->lock); ++ spin_unlock_bh(&table->lock); ++ return true; +} + +void index_hashtable_remove(struct index_hashtable *table, struct index_hashtable_entry *entry) +{ -+ spin_lock(&table->lock); ++ spin_lock_bh(&table->lock); + hlist_del_init_rcu(&entry->index_hash); -+ spin_unlock(&table->lock); ++ spin_unlock_bh(&table->lock); +} + +/* Returns a strong reference to a entry->peer */ +struct index_hashtable_entry *index_hashtable_lookup(struct index_hashtable *table, const enum index_hashtable_type type_mask, const __le32 index) +{ + struct index_hashtable_entry *iter_entry, *entry = NULL; -+ rcu_read_lock(); -+ hlist_for_each_entry_rcu(iter_entry, index_bucket(table, index), index_hash) { ++ rcu_read_lock_bh(); ++ hlist_for_each_entry_rcu_bh (iter_entry, index_bucket(table, index), index_hash) { + if (iter_entry->index == index && (iter_entry->type & type_mask)) { + entry = iter_entry; + break; @@ -1562,18 +1517,19 @@ + if (unlikely(!entry->peer)) + entry = NULL; + } -+ rcu_read_unlock(); ++ rcu_read_unlock_bh(); + return entry; +} ---- /dev/null -+++ b/net/wireguard/main.c -@@ -0,0 +1,70 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/main.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,68 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ + +#include "version.h" +#include "device.h" +#include "noise.h" +#include "packets.h" ++#include "ratelimiter.h" +#include "crypto/chacha20poly1305.h" +#include "crypto/blake2s.h" +#include "crypto/curve25519.h" @@ -1587,17 +1543,15 @@ +{ + int ret; + ++ chacha20poly1305_fpu_init(); ++ blake2s_fpu_init(); ++ curve25519_fpu_init(); +#ifdef DEBUG -+ if (!routing_table_selftest() || !packet_counter_selftest() || !curve25519_selftest() || !chacha20poly1305_selftest() || !blake2s_selftest()) ++ if (!routing_table_selftest() || !packet_counter_selftest() || !curve25519_selftest() || !chacha20poly1305_selftest() || !blake2s_selftest() || !ratelimiter_selftest()) + return -ENOTRECOVERABLE; +#endif -+ chacha20poly1305_init(); + noise_init(); + -+ ret = ratelimiter_module_init(); -+ if (ret < 0) -+ return ret; -+ +#ifdef CONFIG_WIREGUARD_PARALLEL + ret = packet_init_data_caches(); + if (ret < 0) @@ -1618,7 +1572,6 @@ + packet_deinit_data_caches(); +err_packet: +#endif -+ ratelimiter_module_deinit(); + return ret; +} + @@ -1628,8 +1581,7 @@ +#ifdef CONFIG_WIREGUARD_PARALLEL + packet_deinit_data_caches(); +#endif -+ ratelimiter_module_deinit(); -+ pr_debug("WireGuard has been unloaded\n"); ++ pr_debug("WireGuard unloaded\n"); +} + +module_init(mod_init); @@ -1637,10 +1589,11 @@ +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("Fast, secure, and modern VPN tunnel"); +MODULE_AUTHOR("Jason A. 
Donenfeld "); ++MODULE_VERSION(WIREGUARD_VERSION); +MODULE_ALIAS_RTNL_LINK(KBUILD_MODNAME); ---- /dev/null -+++ b/net/wireguard/noise.c -@@ -0,0 +1,593 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/noise.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,612 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ + +#include "noise.h" @@ -1657,50 +1610,66 @@ +#include +#include + -+/* This implements Noise_IK: ++/* This implements Noise_IKpsk2: + * + * <- s + * ****** -+ * -> e, es, s, ss, t -+ * <- e, ee, se ++ * -> e, es, s, ss, {t} ++ * <- e, ee, se, psk, {} + */ + -+static const u8 handshake_name[33] = "Noise_IK_25519_ChaChaPoly_BLAKE2s"; -+static const u8 handshake_psk_name[36] = "NoisePSK_IK_25519_ChaChaPoly_BLAKE2s"; -+static u8 handshake_name_hash[NOISE_HASH_LEN] __read_mostly; -+static u8 handshake_psk_name_hash[NOISE_HASH_LEN] __read_mostly; -+static const u8 identifier_name[34] = "WireGuard v0 zx2c4 Jason@zx2c4.com"; ++static const u8 handshake_name[37] = "Noise_IKpsk2_25519_ChaChaPoly_BLAKE2s"; ++static const u8 identifier_name[34] = "WireGuard v1 zx2c4 Jason@zx2c4.com"; ++static u8 handshake_init_hash[NOISE_HASH_LEN] __read_mostly; ++static u8 handshake_init_chaining_key[NOISE_HASH_LEN] __read_mostly; +static atomic64_t keypair_counter = ATOMIC64_INIT(0); + -+void noise_init(void) ++void __init noise_init(void) +{ -+ blake2s(handshake_name_hash, handshake_name, NULL, NOISE_HASH_LEN, sizeof(handshake_name), 0); -+ blake2s(handshake_psk_name_hash, handshake_psk_name, NULL, NOISE_HASH_LEN, sizeof(handshake_psk_name), 0); ++ struct blake2s_state blake; ++ blake2s(handshake_init_chaining_key, handshake_name, NULL, NOISE_HASH_LEN, sizeof(handshake_name), 0); ++ blake2s_init(&blake, NOISE_HASH_LEN); ++ blake2s_update(&blake, handshake_init_chaining_key, NOISE_HASH_LEN); ++ blake2s_update(&blake, identifier_name, sizeof(identifier_name)); ++ blake2s_final(&blake, handshake_init_hash, NOISE_HASH_LEN); +} + -+void noise_handshake_init(struct noise_handshake *handshake, struct noise_static_identity *static_identity, const u8 peer_public_key[NOISE_PUBLIC_KEY_LEN], struct wireguard_peer *peer) ++bool noise_precompute_static_static(struct wireguard_peer *peer) ++{ ++ if (peer->handshake.static_identity->has_identity) ++ return curve25519(peer->handshake.precomputed_static_static, peer->handshake.static_identity->static_private, peer->handshake.remote_static); ++ memset(peer->handshake.precomputed_static_static, 0, NOISE_PUBLIC_KEY_LEN); ++ return true; ++} ++ ++bool noise_handshake_init(struct noise_handshake *handshake, struct noise_static_identity *static_identity, const u8 peer_public_key[NOISE_PUBLIC_KEY_LEN], const u8 peer_preshared_key[NOISE_SYMMETRIC_KEY_LEN], struct wireguard_peer *peer) +{ + memset(handshake, 0, sizeof(struct noise_handshake)); + init_rwsem(&handshake->lock); + handshake->entry.type = INDEX_HASHTABLE_HANDSHAKE; + handshake->entry.peer = peer; + memcpy(handshake->remote_static, peer_public_key, NOISE_PUBLIC_KEY_LEN); ++ memcpy(handshake->preshared_key, peer_preshared_key, NOISE_SYMMETRIC_KEY_LEN); + handshake->static_identity = static_identity; + handshake->state = HANDSHAKE_ZEROED; ++ return noise_precompute_static_static(peer); ++} ++ ++static void handshake_zero(struct noise_handshake *handshake) ++{ ++ memset(&handshake->ephemeral_private, 0, NOISE_PUBLIC_KEY_LEN); ++ memset(&handshake->remote_ephemeral, 0, NOISE_PUBLIC_KEY_LEN); ++ memset(&handshake->hash, 0, NOISE_HASH_LEN); ++ memset(&handshake->chaining_key, 0, 
NOISE_HASH_LEN); ++ handshake->remote_index = 0; ++ handshake->state = HANDSHAKE_ZEROED; +} + +void noise_handshake_clear(struct noise_handshake *handshake) +{ + index_hashtable_remove(&handshake->entry.peer->device->index_hashtable, &handshake->entry); + down_write(&handshake->lock); -+ memset(&handshake->ephemeral_public, 0, NOISE_PUBLIC_KEY_LEN); -+ memset(&handshake->ephemeral_private, 0, NOISE_PUBLIC_KEY_LEN); -+ memset(&handshake->remote_ephemeral, 0, NOISE_PUBLIC_KEY_LEN); -+ memset(&handshake->hash, 0, NOISE_HASH_LEN); -+ memset(&handshake->chaining_key, 0, NOISE_HASH_LEN); -+ memset(&handshake->key, 0, NOISE_SYMMETRIC_KEY_LEN); -+ handshake->remote_index = 0; -+ handshake->state = HANDSHAKE_ZEROED; ++ handshake_zero(handshake); + up_write(&handshake->lock); + index_hashtable_remove(&handshake->entry.peer->device->index_hashtable, &handshake->entry); +} @@ -1720,7 +1689,7 @@ +static void keypair_free_rcu(struct rcu_head *rcu) +{ + struct noise_keypair *keypair = container_of(rcu, struct noise_keypair, rcu); -+ net_dbg_ratelimited("Keypair %Lu destroyed for peer %Lu\n", keypair->internal_id, keypair->entry.peer->internal_id); ++ net_dbg_ratelimited("%s: Keypair %Lu destroyed for peer %Lu\n", netdev_pub(keypair->entry.peer->device)->name, keypair->internal_id, keypair->entry.peer->internal_id); + kzfree(keypair); +} + @@ -1728,7 +1697,7 @@ +{ + struct noise_keypair *keypair = container_of(kref, struct noise_keypair, refcount); + index_hashtable_remove(&keypair->entry.peer->device->index_hashtable, &keypair->entry); -+ call_rcu(&keypair->rcu, keypair_free_rcu); ++ call_rcu_bh(&keypair->rcu, keypair_free_rcu); +} + +void noise_keypair_put(struct noise_keypair *keypair) @@ -1740,7 +1709,7 @@ + +struct noise_keypair *noise_keypair_get(struct noise_keypair *keypair) +{ -+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "Calling noise_keypair_get without holding the RCU read lock."); ++ RCU_LOCKDEP_WARN(!rcu_read_lock_bh_held(), "Calling noise_keypair_get without holding the RCU BH read lock"); + if (unlikely(!keypair || !kref_get_unless_zero(&keypair->refcount))) + return NULL; + return keypair; @@ -1810,19 +1779,19 @@ + + /* TODO: probably this needs the actual mutex, but we're in atomic context, + * so we can't take it here. Instead we just rely on RCU for the lookups. */ -+ rcu_read_lock(); -+ if (unlikely(received_keypair == rcu_dereference(keypairs->next_keypair))) { ++ rcu_read_lock_bh(); ++ if (unlikely(received_keypair == rcu_dereference_bh(keypairs->next_keypair))) { + ret = true; + /* When we've finally received the confirmation, we slide the next + * into the current, the current into the previous, and get rid of + * the old previous. 
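The rotation is thus next -> current -> previous -> freed, so at most two keypairs stay usable for receiving at any moment, and the put below releases the oldest once an RCU grace period passes.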
*/ -+ old_keypair = rcu_dereference(keypairs->previous_keypair); -+ rcu_assign_pointer(keypairs->previous_keypair, rcu_dereference(keypairs->current_keypair)); ++ old_keypair = rcu_dereference_bh(keypairs->previous_keypair); ++ rcu_assign_pointer(keypairs->previous_keypair, rcu_dereference_bh(keypairs->current_keypair)); + noise_keypair_put(old_keypair); + rcu_assign_pointer(keypairs->current_keypair, received_keypair); + rcu_assign_pointer(keypairs->next_keypair, NULL); + } -+ rcu_read_unlock(); ++ rcu_read_unlock_bh(); + + return ret; +} @@ -1832,8 +1801,7 @@ + down_write(&static_identity->lock); + if (private_key) { + memcpy(static_identity->static_private, private_key, NOISE_PUBLIC_KEY_LEN); -+ curve25519_generate_public(static_identity->static_public, private_key); -+ static_identity->has_identity = true; ++ static_identity->has_identity = curve25519_generate_public(static_identity->static_public, private_key); + } else { + memset(static_identity->static_private, 0, NOISE_PUBLIC_KEY_LEN); + memset(static_identity->static_public, 0, NOISE_PUBLIC_KEY_LEN); @@ -1842,44 +1810,44 @@ + up_write(&static_identity->lock); +} + -+void noise_set_static_identity_preshared_key(struct noise_static_identity *static_identity, const u8 preshared_key[NOISE_SYMMETRIC_KEY_LEN]) -+{ -+ down_write(&static_identity->lock); -+ if (preshared_key) { -+ memcpy(static_identity->preshared_key, preshared_key, NOISE_SYMMETRIC_KEY_LEN); -+ static_identity->has_psk = true; -+ } else { -+ memset(static_identity->preshared_key, 0, NOISE_SYMMETRIC_KEY_LEN); -+ static_identity->has_psk = false; -+ } -+ up_write(&static_identity->lock); -+} -+ +/* This is Hugo Krawczyk's HKDF: + * - https://eprint.iacr.org/2010/264.pdf + * - https://tools.ietf.org/html/rfc5869 + */ -+static void kdf(u8 *first_dst, u8 *second_dst, const u8 *data, -+ size_t first_len, size_t second_len, size_t data_len, -+ const u8 chaining_key[NOISE_HASH_LEN]) ++static void kdf(u8 *first_dst, u8 *second_dst, u8 *third_dst, const u8 *data, size_t first_len, size_t second_len, size_t third_len, size_t data_len, const u8 chaining_key[NOISE_HASH_LEN]) +{ + u8 secret[BLAKE2S_OUTBYTES]; + u8 output[BLAKE2S_OUTBYTES + 1]; -+ BUG_ON(first_len > BLAKE2S_OUTBYTES || second_len > BLAKE2S_OUTBYTES); ++ BUG_ON(first_len > BLAKE2S_OUTBYTES || second_len > BLAKE2S_OUTBYTES || third_len > BLAKE2S_OUTBYTES || ((second_len || second_dst || third_len || third_dst) && (!first_len || !first_dst)) || ((third_len || third_dst) && (!second_len || !second_dst))); + + /* Extract entropy from data into secret */ + blake2s_hmac(secret, data, chaining_key, BLAKE2S_OUTBYTES, data_len, NOISE_HASH_LEN); + ++ if (!first_dst || !first_len) ++ goto out; ++ + /* Expand first key: key = secret, data = 0x1 */ + output[0] = 1; + blake2s_hmac(output, output, secret, BLAKE2S_OUTBYTES, 1, BLAKE2S_OUTBYTES); + memcpy(first_dst, output, first_len); + ++ if (!second_dst || !second_len) ++ goto out; ++ + /* Expand second key: key = secret, data = first-key || 0x2 */ + output[BLAKE2S_OUTBYTES] = 2; + blake2s_hmac(output, output, secret, BLAKE2S_OUTBYTES, BLAKE2S_OUTBYTES + 1, BLAKE2S_OUTBYTES); + memcpy(second_dst, output, second_len); + ++ if (!third_dst || !third_len) ++ goto out; ++ ++ /* Expand third key: key = secret, data = second-key || 0x3 */ ++ output[BLAKE2S_OUTBYTES] = 3; ++ blake2s_hmac(output, output, secret, BLAKE2S_OUTBYTES, BLAKE2S_OUTBYTES + 1, BLAKE2S_OUTBYTES); ++ memcpy(third_dst, output, third_len); ++ ++out: + /* Clear sensitive data from stack */ + memzero_explicit(secret, 
BLAKE2S_OUTBYTES); + memzero_explicit(output, BLAKE2S_OUTBYTES + 1); @@ -1896,23 +1864,19 @@ + +static void derive_keys(struct noise_symmetric_key *first_dst, struct noise_symmetric_key *second_dst, const u8 chaining_key[NOISE_HASH_LEN]) +{ -+ kdf(first_dst->key, second_dst->key, NULL, NOISE_SYMMETRIC_KEY_LEN, NOISE_SYMMETRIC_KEY_LEN, 0, chaining_key); ++ kdf(first_dst->key, second_dst->key, NULL, NULL, NOISE_SYMMETRIC_KEY_LEN, NOISE_SYMMETRIC_KEY_LEN, 0, 0, chaining_key); + symmetric_key_init(first_dst); + symmetric_key_init(second_dst); +} + -+static void mix_key(u8 key[NOISE_SYMMETRIC_KEY_LEN], u8 chaining_key[NOISE_HASH_LEN], const u8 *src, size_t src_len) -+{ -+ kdf(chaining_key, key, src, NOISE_HASH_LEN, NOISE_SYMMETRIC_KEY_LEN, src_len, chaining_key); -+} -+ -+static void mix_dh(u8 key[NOISE_SYMMETRIC_KEY_LEN], u8 chaining_key[NOISE_HASH_LEN], -+ const u8 private[NOISE_PUBLIC_KEY_LEN], const u8 public[NOISE_PUBLIC_KEY_LEN]) ++static bool __must_check mix_dh(u8 chaining_key[NOISE_HASH_LEN], u8 key[NOISE_SYMMETRIC_KEY_LEN], const u8 private[NOISE_PUBLIC_KEY_LEN], const u8 public[NOISE_PUBLIC_KEY_LEN]) +{ + u8 dh_calculation[NOISE_PUBLIC_KEY_LEN]; -+ curve25519(dh_calculation, private, public); -+ mix_key(key, chaining_key, dh_calculation, NOISE_PUBLIC_KEY_LEN); ++ if (unlikely(!curve25519(dh_calculation, private, public))) ++ return false; ++ kdf(chaining_key, key, NULL, dh_calculation, NOISE_HASH_LEN, NOISE_SYMMETRIC_KEY_LEN, 0, NOISE_PUBLIC_KEY_LEN, chaining_key); + memzero_explicit(dh_calculation, NOISE_PUBLIC_KEY_LEN); ++ return true; +} + +static void mix_hash(u8 hash[NOISE_HASH_LEN], const u8 *src, size_t src_len) @@ -1924,31 +1888,28 @@ + blake2s_final(&blake, hash, NOISE_HASH_LEN); +} + -+static void handshake_init(u8 key[NOISE_SYMMETRIC_KEY_LEN], u8 chaining_key[NOISE_HASH_LEN], u8 hash[NOISE_HASH_LEN], -+ const u8 remote_static[NOISE_PUBLIC_KEY_LEN], const u8 psk[NOISE_SYMMETRIC_KEY_LEN]) ++static void mix_psk(u8 chaining_key[NOISE_HASH_LEN], u8 hash[NOISE_HASH_LEN], u8 key[NOISE_SYMMETRIC_KEY_LEN], const u8 psk[NOISE_SYMMETRIC_KEY_LEN]) +{ -+ memset(key, 0, NOISE_SYMMETRIC_KEY_LEN); -+ memcpy(hash, psk ? 
handshake_psk_name_hash : handshake_name_hash, NOISE_HASH_LEN); -+ mix_hash(hash, identifier_name, sizeof(identifier_name)); -+ if (psk) { -+ u8 temp_hash[NOISE_HASH_LEN]; -+ kdf(chaining_key, temp_hash, psk, NOISE_HASH_LEN, NOISE_HASH_LEN, NOISE_SYMMETRIC_KEY_LEN, handshake_psk_name_hash); -+ mix_hash(hash, temp_hash, NOISE_HASH_LEN); -+ memzero_explicit(temp_hash, NOISE_HASH_LEN); -+ } else -+ memcpy(chaining_key, handshake_name_hash, NOISE_HASH_LEN); ++ u8 temp_hash[NOISE_HASH_LEN]; ++ kdf(chaining_key, temp_hash, key, psk, NOISE_HASH_LEN, NOISE_HASH_LEN, NOISE_SYMMETRIC_KEY_LEN, NOISE_SYMMETRIC_KEY_LEN, chaining_key); ++ mix_hash(hash, temp_hash, NOISE_HASH_LEN); ++ memzero_explicit(temp_hash, NOISE_HASH_LEN); ++} ++ ++static void handshake_init(u8 chaining_key[NOISE_HASH_LEN], u8 hash[NOISE_HASH_LEN], const u8 remote_static[NOISE_PUBLIC_KEY_LEN]) ++{ ++ memcpy(hash, handshake_init_hash, NOISE_HASH_LEN); ++ memcpy(chaining_key, handshake_init_chaining_key, NOISE_HASH_LEN); + mix_hash(hash, remote_static, NOISE_PUBLIC_KEY_LEN); +} + -+static bool handshake_encrypt(u8 *dst_ciphertext, const u8 *src_plaintext, size_t src_len, u8 key[NOISE_SYMMETRIC_KEY_LEN], u8 hash[NOISE_HASH_LEN]) ++static void message_encrypt(u8 *dst_ciphertext, const u8 *src_plaintext, size_t src_len, u8 key[NOISE_SYMMETRIC_KEY_LEN], u8 hash[NOISE_HASH_LEN]) +{ -+ if (!chacha20poly1305_encrypt(dst_ciphertext, src_plaintext, src_len, hash, NOISE_HASH_LEN, 0 /* Always zero for Noise_IK */, key)) -+ return false; ++ chacha20poly1305_encrypt(dst_ciphertext, src_plaintext, src_len, hash, NOISE_HASH_LEN, 0 /* Always zero for Noise_IK */, key); + mix_hash(hash, dst_ciphertext, noise_encrypted_len(src_len)); -+ return true; +} + -+static bool handshake_decrypt(u8 *dst_plaintext, const u8 *src_ciphertext, size_t src_len, u8 key[NOISE_SYMMETRIC_KEY_LEN], u8 hash[NOISE_HASH_LEN]) ++static bool message_decrypt(u8 *dst_plaintext, const u8 *src_ciphertext, size_t src_len, u8 key[NOISE_SYMMETRIC_KEY_LEN], u8 hash[NOISE_HASH_LEN]) +{ + if (!chacha20poly1305_decrypt(dst_plaintext, src_ciphertext, src_len, hash, NOISE_HASH_LEN, 0 /* Always zero for Noise_IK */, key)) + return false; @@ -1956,10 +1917,12 @@ + return true; +} + -+static void handshake_nocrypt(u8 *dst, const u8 *src, size_t src_len, u8 hash[NOISE_HASH_LEN]) ++static void message_ephemeral(u8 ephemeral_dst[NOISE_PUBLIC_KEY_LEN], const u8 ephemeral_src[NOISE_PUBLIC_KEY_LEN], u8 chaining_key[NOISE_HASH_LEN], u8 hash[NOISE_HASH_LEN]) +{ -+ memcpy(dst, src, src_len); -+ mix_hash(hash, src, src_len); ++ if (ephemeral_dst != ephemeral_src) ++ memcpy(ephemeral_dst, ephemeral_src, NOISE_PUBLIC_KEY_LEN); ++ mix_hash(hash, ephemeral_src, NOISE_PUBLIC_KEY_LEN); ++ kdf(chaining_key, NULL, NULL, ephemeral_src, NOISE_HASH_LEN, 0, 0, NOISE_PUBLIC_KEY_LEN, chaining_key); +} + +static void tai64n_now(u8 output[NOISE_TIMESTAMP_LEN]) @@ -1974,6 +1937,7 @@ +bool noise_handshake_create_initiation(struct message_handshake_initiation *dst, struct noise_handshake *handshake) +{ + u8 timestamp[NOISE_TIMESTAMP_LEN]; ++ u8 key[NOISE_SYMMETRIC_KEY_LEN]; + bool ret = false; + + down_read(&handshake->static_identity->lock); @@ -1984,39 +1948,37 @@ + + dst->header.type = cpu_to_le32(MESSAGE_HANDSHAKE_INITIATION); + -+ handshake_init(handshake->key, handshake->chaining_key, handshake->hash, handshake->remote_static, -+ handshake->static_identity->has_psk ? 
handshake->static_identity->preshared_key : NULL); ++ handshake_init(handshake->chaining_key, handshake->hash, handshake->remote_static); + + /* e */ + curve25519_generate_secret(handshake->ephemeral_private); -+ curve25519_generate_public(handshake->ephemeral_public, handshake->ephemeral_private); -+ handshake_nocrypt(dst->unencrypted_ephemeral, handshake->ephemeral_public, NOISE_PUBLIC_KEY_LEN, handshake->hash); -+ if (handshake->static_identity->has_psk) -+ mix_key(handshake->key, handshake->chaining_key, handshake->ephemeral_public, NOISE_PUBLIC_KEY_LEN); ++ if (!curve25519_generate_public(dst->unencrypted_ephemeral, handshake->ephemeral_private)) ++ goto out; ++ message_ephemeral(dst->unencrypted_ephemeral, dst->unencrypted_ephemeral, handshake->chaining_key, handshake->hash); + + /* es */ -+ mix_dh(handshake->key, handshake->chaining_key, handshake->ephemeral_private, handshake->remote_static); ++ if (!mix_dh(handshake->chaining_key, key, handshake->ephemeral_private, handshake->remote_static)) ++ goto out; + + /* s */ -+ if (!handshake_encrypt(dst->encrypted_static, handshake->static_identity->static_public, NOISE_PUBLIC_KEY_LEN, handshake->key, handshake->hash)) -+ goto out; ++ message_encrypt(dst->encrypted_static, handshake->static_identity->static_public, NOISE_PUBLIC_KEY_LEN, key, handshake->hash); + + /* ss */ -+ mix_dh(handshake->key, handshake->chaining_key, handshake->static_identity->static_private, handshake->remote_static); ++ kdf(handshake->chaining_key, key, NULL, handshake->precomputed_static_static, NOISE_HASH_LEN, NOISE_SYMMETRIC_KEY_LEN, 0, NOISE_PUBLIC_KEY_LEN, handshake->chaining_key); + -+ /* t */ ++ /* {t} */ + tai64n_now(timestamp); -+ if (!handshake_encrypt(dst->encrypted_timestamp, timestamp, NOISE_TIMESTAMP_LEN, handshake->key, handshake->hash)) -+ goto out; ++ message_encrypt(dst->encrypted_timestamp, timestamp, NOISE_TIMESTAMP_LEN, key, handshake->hash); + + dst->sender_index = index_hashtable_insert(&handshake->entry.peer->device->index_hashtable, &handshake->entry); + -+ ret = true; + handshake->state = HANDSHAKE_CREATED_INITIATION; ++ ret = true; + +out: + up_write(&handshake->lock); + up_read(&handshake->static_identity->lock); ++ memzero_explicit(key, NOISE_SYMMETRIC_KEY_LEN); + return ret; +} + @@ -2036,26 +1998,17 @@ + if (unlikely(!wg->static_identity.has_identity)) + goto out; + -+ handshake_init(key, chaining_key, hash, wg->static_identity.static_public, -+ wg->static_identity.has_psk ? 
wg->static_identity.preshared_key : NULL); ++ handshake_init(chaining_key, hash, wg->static_identity.static_public); + + /* e */ -+ handshake_nocrypt(e, src->unencrypted_ephemeral, sizeof(src->unencrypted_ephemeral), hash); -+ if (wg->static_identity.has_psk) -+ mix_key(key, chaining_key, e, NOISE_PUBLIC_KEY_LEN); ++ message_ephemeral(e, src->unencrypted_ephemeral, chaining_key, hash); + + /* es */ -+ mix_dh(key, chaining_key, wg->static_identity.static_private, e); -+ -+ /* s */ -+ if (!handshake_decrypt(s, src->encrypted_static, sizeof(src->encrypted_static), key, hash)) ++ if (!mix_dh(chaining_key, key, wg->static_identity.static_private, e)) + goto out; + -+ /* ss */ -+ mix_dh(key, chaining_key, wg->static_identity.static_private, s); -+ -+ /* t */ -+ if (!handshake_decrypt(t, src->encrypted_timestamp, sizeof(src->encrypted_timestamp), key, hash)) ++ /* s */ ++ if (!message_decrypt(s, src->encrypted_static, sizeof(src->encrypted_static), key, hash)) + goto out; + + /* Lookup which peer we're actually talking to */ @@ -2063,6 +2016,14 @@ + if (!wg_peer) + goto out; + handshake = &wg_peer->handshake; ++ ++ /* ss */ ++ kdf(chaining_key, key, NULL, handshake->precomputed_static_static, NOISE_HASH_LEN, NOISE_SYMMETRIC_KEY_LEN, 0, NOISE_PUBLIC_KEY_LEN, chaining_key); ++ ++ /* {t} */ ++ if (!message_decrypt(t, src->encrypted_timestamp, sizeof(src->encrypted_timestamp), key, hash)) ++ goto out; ++ + down_read(&handshake->lock); + replay_attack = memcmp(t, handshake->latest_timestamp, NOISE_TIMESTAMP_LEN) <= 0; + flood_attack = !time_is_before_jiffies64(handshake->last_initiation_consumption + INITIATIONS_PER_SECOND); @@ -2077,7 +2038,6 @@ + down_write(&handshake->lock); + memcpy(handshake->remote_ephemeral, e, NOISE_PUBLIC_KEY_LEN); + memcpy(handshake->latest_timestamp, t, NOISE_TIMESTAMP_LEN); -+ memcpy(handshake->key, key, NOISE_SYMMETRIC_KEY_LEN); + memcpy(handshake->hash, hash, NOISE_HASH_LEN); + memcpy(handshake->chaining_key, chaining_key, NOISE_HASH_LEN); + handshake->remote_index = src->sender_index; @@ -2096,6 +2056,7 @@ +bool noise_handshake_create_response(struct message_handshake_response *dst, struct noise_handshake *handshake) +{ + bool ret = false; ++ u8 key[NOISE_SYMMETRIC_KEY_LEN]; + down_read(&handshake->static_identity->lock); + down_write(&handshake->lock); + @@ -2107,20 +2068,24 @@ + + /* e */ + curve25519_generate_secret(handshake->ephemeral_private); -+ curve25519_generate_public(handshake->ephemeral_public, handshake->ephemeral_private); -+ handshake_nocrypt(dst->unencrypted_ephemeral, handshake->ephemeral_public, NOISE_PUBLIC_KEY_LEN, handshake->hash); -+ if (handshake->static_identity->has_psk) -+ mix_key(handshake->key, handshake->chaining_key, handshake->ephemeral_public, NOISE_PUBLIC_KEY_LEN); ++ if (!curve25519_generate_public(dst->unencrypted_ephemeral, handshake->ephemeral_private)) ++ goto out; ++ message_ephemeral(dst->unencrypted_ephemeral, dst->unencrypted_ephemeral, handshake->chaining_key, handshake->hash); + + /* ee */ -+ mix_dh(handshake->key, handshake->chaining_key, handshake->ephemeral_private, handshake->remote_ephemeral); ++ if (!mix_dh(handshake->chaining_key, NULL, handshake->ephemeral_private, handshake->remote_ephemeral)) ++ goto out; + + /* se */ -+ mix_dh(handshake->key, handshake->chaining_key, handshake->ephemeral_private, handshake->remote_static); -+ -+ if (!handshake_encrypt(dst->encrypted_nothing, NULL, 0, handshake->key, handshake->hash)) ++ if (!mix_dh(handshake->chaining_key, NULL, handshake->ephemeral_private, 
handshake->remote_static)) + goto out; + ++ /* psk */ ++ mix_psk(handshake->chaining_key, handshake->hash, key, handshake->preshared_key); ++ ++ /* {} */ ++ message_encrypt(dst->encrypted_nothing, NULL, 0, key, handshake->hash); ++ + dst->sender_index = index_hashtable_insert(&handshake->entry.peer->device->index_hashtable, &handshake->entry); + + handshake->state = HANDSHAKE_CREATED_RESPONSE; @@ -2129,6 +2094,7 @@ +out: + up_write(&handshake->lock); + up_read(&handshake->static_identity->lock); ++ memzero_explicit(key, NOISE_SYMMETRIC_KEY_LEN); + return ret; +} + @@ -2155,7 +2121,6 @@ + + down_read(&handshake->lock); + state = handshake->state; -+ memcpy(key, handshake->key, NOISE_SYMMETRIC_KEY_LEN); + memcpy(hash, handshake->hash, NOISE_HASH_LEN); + memcpy(chaining_key, handshake->chaining_key, NOISE_HASH_LEN); + memcpy(ephemeral_private, handshake->ephemeral_private, NOISE_PUBLIC_KEY_LEN); @@ -2165,24 +2130,31 @@ + goto fail; + + /* e */ -+ handshake_nocrypt(e, src->unencrypted_ephemeral, sizeof(src->unencrypted_ephemeral), hash); -+ if (wg->static_identity.has_psk) -+ mix_key(key, chaining_key, e, NOISE_PUBLIC_KEY_LEN); ++ message_ephemeral(e, src->unencrypted_ephemeral, chaining_key, hash); + + /* ee */ -+ mix_dh(key, chaining_key, ephemeral_private, e); ++ if (!mix_dh(chaining_key, NULL, ephemeral_private, e)) ++ goto out; + + /* se */ -+ mix_dh(key, chaining_key, wg->static_identity.static_private, e); ++ if (!mix_dh(chaining_key, NULL, wg->static_identity.static_private, e)) ++ goto out; + -+ /* decrypt nothing */ -+ if (!handshake_decrypt(NULL, src->encrypted_nothing, sizeof(src->encrypted_nothing), key, hash)) ++ /* psk */ ++ mix_psk(chaining_key, hash, key, handshake->preshared_key); ++ ++ /* {} */ ++ if (!message_decrypt(NULL, src->encrypted_nothing, sizeof(src->encrypted_nothing), key, hash)) + goto fail; + + /* Success! 
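The empty payload decrypted under a key bound to every DH result so far plus the preshared key, authenticating the responder.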
Copy everything to peer */ + down_write(&handshake->lock); ++ /* It's important to check that the state is still the same, while we have an exclusive lock */ ++ if (handshake->state != state) { ++ up_write(&handshake->lock); ++ goto fail; ++ } + memcpy(handshake->remote_ephemeral, e, NOISE_PUBLIC_KEY_LEN); -+ memcpy(handshake->key, key, NOISE_SYMMETRIC_KEY_LEN); + memcpy(handshake->hash, hash, NOISE_HASH_LEN); + memcpy(handshake->chaining_key, chaining_key, NOISE_HASH_LEN); + handshake->remote_index = src->sender_index; @@ -2207,7 +2179,7 @@ +{ + struct noise_keypair *new_keypair; + -+ down_read(&handshake->lock); ++ down_write(&handshake->lock); + if (handshake->state != HANDSHAKE_CREATED_RESPONSE && handshake->state != HANDSHAKE_CONSUMED_RESPONSE) + goto fail; + @@ -2221,22 +2193,22 @@ + derive_keys(&new_keypair->sending, &new_keypair->receiving, handshake->chaining_key); + else + derive_keys(&new_keypair->receiving, &new_keypair->sending, handshake->chaining_key); -+ up_read(&handshake->lock); + ++ handshake_zero(handshake); + add_new_keypair(keypairs, new_keypair); -+ index_hashtable_replace(&handshake->entry.peer->device->index_hashtable, &handshake->entry, &new_keypair->entry); -+ noise_handshake_clear(handshake); -+ net_dbg_ratelimited("Keypair %Lu created for peer %Lu\n", new_keypair->internal_id, new_keypair->entry.peer->internal_id); ++ net_dbg_ratelimited("%s: Keypair %Lu created for peer %Lu\n", netdev_pub(new_keypair->entry.peer->device)->name, new_keypair->internal_id, new_keypair->entry.peer->internal_id); ++ WARN_ON(!index_hashtable_replace(&handshake->entry.peer->device->index_hashtable, &handshake->entry, &new_keypair->entry)); ++ up_write(&handshake->lock); + + return true; + +fail: -+ up_read(&handshake->lock); ++ up_write(&handshake->lock); + return false; +} ---- /dev/null -+++ b/net/wireguard/peer.c -@@ -0,0 +1,151 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/peer.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,130 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. 
*/ + +#include "peer.h" @@ -2253,7 +2225,7 @@ + +static atomic64_t peer_counter = ATOMIC64_INIT(0); + -+struct wireguard_peer *peer_create(struct wireguard_device *wg, const u8 public_key[NOISE_PUBLIC_KEY_LEN]) ++struct wireguard_peer *peer_create(struct wireguard_device *wg, const u8 public_key[NOISE_PUBLIC_KEY_LEN], const u8 preshared_key[NOISE_SYMMETRIC_KEY_LEN]) +{ + struct wireguard_peer *peer; + lockdep_assert_held(&wg->device_update_lock); @@ -2273,8 +2245,11 @@ + peer->internal_id = atomic64_inc_return(&peer_counter); + peer->device = wg; + cookie_init(&peer->latest_cookie); -+ noise_handshake_init(&peer->handshake, &wg->static_identity, public_key, peer); -+ cookie_checker_precompute_keys(&wg->cookie_checker, peer); ++ if (!noise_handshake_init(&peer->handshake, &wg->static_identity, public_key, preshared_key, peer)) { ++ kfree(peer); ++ return NULL; ++ } ++ cookie_checker_precompute_peer_keys(peer); + mutex_init(&peer->keypairs.keypair_update_lock); + INIT_WORK(&peer->transmit_handshake_work, packet_send_queued_handshakes); + rwlock_init(&peer->endpoint_lock); @@ -2282,13 +2257,16 @@ + kref_init(&peer->refcount); + pubkey_hashtable_add(&wg->peer_hashtable, peer); + list_add_tail(&peer->peer_list, &wg->peer_list); -+ pr_debug("Peer %Lu created\n", peer->internal_id); ++#ifdef CONFIG_WIREGUARD_PARALLEL ++ atomic_set(&peer->parallel_encryption_inflight, 0); ++#endif ++ pr_debug("%s: Peer %Lu created\n", netdev_pub(wg)->name, peer->internal_id); + return peer; +} + +struct wireguard_peer *peer_get(struct wireguard_peer *peer) +{ -+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "Calling peer_get without holding the RCU read lock."); ++ RCU_LOCKDEP_WARN(!rcu_read_lock_bh_held(), "Calling peer_get without holding the RCU read lock"); + if (unlikely(!peer || !kref_get_unless_zero(&peer->refcount))) + return NULL; + return peer; @@ -2296,9 +2274,9 @@ + +struct wireguard_peer *peer_rcu_get(struct wireguard_peer *peer) +{ -+ rcu_read_lock(); ++ rcu_read_lock_bh(); + peer = peer_get(peer); -+ rcu_read_unlock(); ++ rcu_read_unlock_bh(); + return peer; +} + @@ -2316,8 +2294,8 @@ + timers_uninit_peer(peer); + routing_table_remove_by_peer(&peer->device->peer_routing_table, peer); + pubkey_hashtable_remove(&peer->device->peer_hashtable, peer); -+ if (peer->device->workqueue) -+ flush_workqueue(peer->device->workqueue); ++ if (peer->device->peer_wq) ++ flush_workqueue(peer->device->peer_wq); + skb_queue_purge(&peer->tx_packet_queue); + peer_put(peer); +} @@ -2325,7 +2303,7 @@ +static void rcu_release(struct rcu_head *rcu) +{ + struct wireguard_peer *peer = container_of(rcu, struct wireguard_peer, rcu); -+ pr_debug("Peer %Lu (%pISpfsc) destroyed\n", peer->internal_id, &peer->endpoint.addr); ++ pr_debug("%s: Peer %Lu (%pISpfsc) destroyed\n", netdev_pub(peer->device)->name, peer->internal_id, &peer->endpoint.addr); + skb_queue_purge(&peer->tx_packet_queue); + dst_cache_destroy(&peer->endpoint_cache); + kzfree(peer); @@ -2334,7 +2312,7 @@ +static void kref_release(struct kref *refcount) +{ + struct wireguard_peer *peer = container_of(refcount, struct wireguard_peer, refcount); -+ call_rcu(&peer->rcu, rcu_release); ++ call_rcu_bh(&peer->rcu, rcu_release); +} + +void peer_put(struct wireguard_peer *peer) @@ -2344,38 +2322,11 @@ + kref_put(&peer->refcount, kref_release); +} + -+int peer_for_each_unlocked(struct wireguard_device *wg, int (*fn)(struct wireguard_peer *peer, void *ctx), void *data) -+{ -+ struct wireguard_peer *peer, *temp; -+ int ret = 0; -+ -+ lockdep_assert_held(&wg->device_update_lock); -+ 
list_for_each_entry_safe(peer, temp, &wg->peer_list, peer_list) { -+ peer = peer_rcu_get(peer); -+ if (unlikely(!peer)) -+ continue; -+ ret = fn(peer, data); -+ peer_put(peer); -+ if (ret < 0) -+ break; -+ } -+ return ret; -+} -+ -+int peer_for_each(struct wireguard_device *wg, int (*fn)(struct wireguard_peer *peer, void *ctx), void *data) -+{ -+ int ret; -+ mutex_lock(&wg->device_update_lock); -+ ret = peer_for_each_unlocked(wg, fn, data); -+ mutex_unlock(&wg->device_update_lock); -+ return ret; -+} -+ +void peer_remove_all(struct wireguard_device *wg) +{ + struct wireguard_peer *peer, *temp; + lockdep_assert_held(&wg->device_update_lock); -+ list_for_each_entry_safe(peer, temp, &wg->peer_list, peer_list) ++ list_for_each_entry_safe (peer, temp, &wg->peer_list, peer_list) + peer_remove(peer); +} + @@ -2384,153 +2335,209 @@ + unsigned int i = 0; + struct wireguard_peer *peer; + lockdep_assert_held(&wg->device_update_lock); -+ list_for_each_entry(peer, &wg->peer_list, peer_list) ++ list_for_each_entry (peer, &wg->peer_list, peer_list) + ++i; + return i; +} ---- /dev/null -+++ b/net/wireguard/ratelimiter.c -@@ -0,0 +1,138 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/ratelimiter.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,194 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ + +#include "ratelimiter.h" -+#include "peer.h" -+#include "device.h" -+ -+#include -+#include ++#include ++#include ++#include +#include + -+static struct xt_match *v4_match __read_mostly; ++static struct kmem_cache *entry_cache; ++static hsiphash_key_t key; ++static spinlock_t table_lock = __SPIN_LOCK_UNLOCKED("ratelimiter_table_lock"); ++static atomic64_t refcnt = ATOMIC64_INIT(0); ++static atomic_t total_entries = ATOMIC_INIT(0); ++static unsigned int max_entries, table_size; ++static void gc_entries(struct work_struct *); ++static DECLARE_DEFERRABLE_WORK(gc_work, gc_entries); ++static struct hlist_head *table_v4; +#if IS_ENABLED(CONFIG_IPV6) -+static struct xt_match *v6_match __read_mostly; ++static struct hlist_head *table_v6; +#endif + -+enum { -+ RATELIMITER_PACKETS_PER_SECOND = 30, -+ RATELIMITER_PACKETS_BURSTABLE = 5 ++struct entry { ++ u64 last_time_ns, tokens; ++ __be64 ip; ++ void *net; ++ spinlock_t lock; ++ struct hlist_node hash; ++ struct rcu_head rcu; +}; + -+static inline void cfg_init(struct hashlimit_cfg1 *cfg, int family) ++enum { ++ PACKETS_PER_SECOND = 20, ++ PACKETS_BURSTABLE = 5, ++ PACKET_COST = NSEC_PER_SEC / PACKETS_PER_SECOND, ++ TOKEN_MAX = PACKET_COST * PACKETS_BURSTABLE ++}; ++ ++static void entry_free(struct rcu_head *rcu) +{ -+ memset(cfg, 0, sizeof(struct hashlimit_cfg1)); -+ if (family == NFPROTO_IPV4) -+ cfg->srcmask = 32; -+ else if (family == NFPROTO_IPV6) -+ cfg->srcmask = 96; -+ cfg->mode = XT_HASHLIMIT_HASH_SIP; /* source IP only -- we could also do source port by ORing this with XT_HASHLIMIT_HASH_SPT */ -+ cfg->avg = XT_HASHLIMIT_SCALE / RATELIMITER_PACKETS_PER_SECOND; /* 30 per second per IP */ -+ cfg->burst = RATELIMITER_PACKETS_BURSTABLE; /* Allow bursts of 5 at a time */ -+ cfg->gc_interval = 1000; /* same as expiration date */ -+ cfg->expire = 1000; /* Units of avg (seconds = 1) times 1000 */ -+ /* cfg->size and cfg->max are computed based on the memory size of left to zero */ ++ kmem_cache_free(entry_cache, container_of(rcu, struct entry, rcu)); ++ atomic_dec(&total_entries); +} + -+int ratelimiter_init(struct ratelimiter *ratelimiter, struct wireguard_device *wg) ++static void entry_uninit(struct entry *entry) 
+{ -+ struct net_device *dev = netdev_pub(wg); -+ struct xt_mtchk_param chk = { .net = wg->creating_net }; -+ int ret; ++ hlist_del_rcu(&entry->hash); ++ call_rcu_bh(&entry->rcu, entry_free); ++} + -+ memset(ratelimiter, 0, sizeof(struct ratelimiter)); -+ -+ cfg_init(&ratelimiter->v4_info.cfg, NFPROTO_IPV4); -+ memcpy(ratelimiter->v4_info.name, dev->name, IFNAMSIZ); -+ chk.matchinfo = &ratelimiter->v4_info; -+ chk.match = v4_match; -+ chk.family = NFPROTO_IPV4; -+ ret = v4_match->checkentry(&chk); -+ if (ret < 0) -+ return ret; ++/* Calling this function with a NULL work uninits all entries. */ ++static void gc_entries(struct work_struct *work) ++{ ++ unsigned int i; ++ struct entry *entry; ++ struct hlist_node *temp; ++ const u64 now = ktime_get_ns(); + ++ for (i = 0; i < table_size; ++i) { ++ spin_lock(&table_lock); ++ hlist_for_each_entry_safe (entry, temp, &table_v4[i], hash) { ++ if (unlikely(!work) || now - entry->last_time_ns > NSEC_PER_SEC) ++ entry_uninit(entry); ++ } +#if IS_ENABLED(CONFIG_IPV6) -+ cfg_init(&ratelimiter->v6_info.cfg, NFPROTO_IPV6); -+ memcpy(ratelimiter->v6_info.name, dev->name, IFNAMSIZ); -+ chk.matchinfo = &ratelimiter->v6_info; -+ chk.match = v6_match; -+ chk.family = NFPROTO_IPV6; -+ ret = v6_match->checkentry(&chk); -+ if (ret < 0) { -+ struct xt_mtdtor_param dtor_v4 = { -+ .net = wg->creating_net, -+ .match = v4_match, -+ .matchinfo = &ratelimiter->v4_info, -+ .family = NFPROTO_IPV4 -+ }; -+ v4_match->destroy(&dtor_v4); -+ return ret; ++ hlist_for_each_entry_safe (entry, temp, &table_v6[i], hash) { ++ if (unlikely(!work) || now - entry->last_time_ns > NSEC_PER_SEC) ++ entry_uninit(entry); ++ } ++#endif ++ spin_unlock(&table_lock); ++ if (likely(work)) ++ cond_resched(); + } -+#endif -+ -+ ratelimiter->net = wg->creating_net; -+ return 0; ++ if (likely(work)) ++ queue_delayed_work(system_power_efficient_wq, &gc_work, HZ); +} + -+void ratelimiter_uninit(struct ratelimiter *ratelimiter) ++bool ratelimiter_allow(struct sk_buff *skb, struct net *net) +{ -+ struct xt_mtdtor_param dtor = { .net = ratelimiter->net }; ++ struct entry *entry; ++ struct hlist_head *bucket; ++ struct { __be64 ip; u32 net; } data = { .net = (unsigned long)net & 0xffffffff }; + -+ dtor.match = v4_match; -+ dtor.matchinfo = &ratelimiter->v4_info; -+ dtor.family = NFPROTO_IPV4; -+ v4_match->destroy(&dtor); -+ -+#if IS_ENABLED(CONFIG_IPV6) -+ dtor.match = v6_match; -+ dtor.matchinfo = &ratelimiter->v6_info; -+ dtor.family = NFPROTO_IPV6; -+ v6_match->destroy(&dtor); -+#endif -+} -+ -+bool ratelimiter_allow(struct ratelimiter *ratelimiter, struct sk_buff *skb) -+{ -+ struct xt_action_param action = { { NULL } }; -+ if (unlikely(skb->len < sizeof(struct iphdr))) -+ return false; -+ if (ip_hdr(skb)->version == 4) { -+ action.match = v4_match; -+ action.matchinfo = &ratelimiter->v4_info; -+ action.thoff = ip_hdrlen(skb); ++ if (skb->protocol == htons(ETH_P_IP)) { ++ data.ip = (__force __be64)ip_hdr(skb)->saddr; ++ bucket = &table_v4[hsiphash(&data, sizeof(u32) * 3, &key) & (table_size - 1)]; + } +#if IS_ENABLED(CONFIG_IPV6) -+ else if (ip_hdr(skb)->version == 6) { -+ action.match = v6_match; -+ action.matchinfo = &ratelimiter->v6_info; ++ else if (skb->protocol == htons(ETH_P_IPV6)) { ++ memcpy(&data.ip, &ipv6_hdr(skb)->saddr, sizeof(__be64)); /* Only 64 bits */ ++ bucket = &table_v6[hsiphash(&data, sizeof(u32) * 3, &key) & (table_size - 1)]; + } +#endif + else + return false; -+ return action.match->match(skb, &action); ++ rcu_read_lock(); ++ hlist_for_each_entry_rcu (entry, bucket, hash) { ++ 
if (entry->net == net && entry->ip == data.ip) { ++ u64 now, tokens; ++ bool ret; ++ /* Inspired by nft_limit.c, but this is actually a slightly different ++ * algorithm. Namely, we incorporate the burst as part of the maximum ++ * tokens, rather than as part of the rate. */ ++ spin_lock(&entry->lock); ++ now = ktime_get_ns(); ++ tokens = min_t(u64, TOKEN_MAX, entry->tokens + now - entry->last_time_ns); ++ entry->last_time_ns = now; ++ ret = tokens >= PACKET_COST; ++ entry->tokens = ret ? tokens - PACKET_COST : tokens; ++ spin_unlock(&entry->lock); ++ rcu_read_unlock(); ++ return ret; ++ } ++ } ++ rcu_read_unlock(); ++ ++ if (atomic_inc_return(&total_entries) > max_entries) ++ goto err_oom; ++ ++ entry = kmem_cache_alloc(entry_cache, GFP_KERNEL); ++ if (!entry) ++ goto err_oom; ++ ++ entry->net = net; ++ entry->ip = data.ip; ++ INIT_HLIST_NODE(&entry->hash); ++ spin_lock_init(&entry->lock); ++ entry->last_time_ns = ktime_get_ns(); ++ entry->tokens = TOKEN_MAX - PACKET_COST; ++ spin_lock(&table_lock); ++ hlist_add_head_rcu(&entry->hash, bucket); ++ spin_unlock(&table_lock); ++ return true; ++ ++err_oom: ++ atomic_dec(&total_entries); ++ return false; +} + -+int ratelimiter_module_init(void) ++int ratelimiter_init(void) +{ -+ v4_match = xt_request_find_match(NFPROTO_IPV4, "hashlimit", 1); -+ if (IS_ERR(v4_match)) { -+ pr_err("The xt_hashlimit module for IPv4 is required\n"); -+ return PTR_ERR(v4_match); -+ } ++ if (atomic64_inc_return(&refcnt) != 1) ++ return 0; ++ ++ entry_cache = kmem_cache_create("wireguard_ratelimiter", sizeof(struct entry), 0, 0, NULL); ++ if (!entry_cache) ++ goto err; ++ ++ /* xt_hashlimit.c uses a slightly different algorithm for ratelimiting, ++ * but what it shares in common is that it uses a massive hashtable. So, ++ * we borrow their wisdom about good table sizes on different systems ++ * dependent on RAM. This calculation here comes from there. */ ++ table_size = (totalram_pages > (1 << 30) / PAGE_SIZE) ? 8192 : max_t(unsigned long, 16, roundup_pow_of_two((totalram_pages << PAGE_SHIFT) / (1 << 14) / sizeof(struct hlist_head))); ++ max_entries = table_size * 8; ++ ++ table_v4 = kvzalloc(table_size * sizeof(struct hlist_head), GFP_KERNEL); ++ if (!table_v4) ++ goto err_kmemcache; ++ +#if IS_ENABLED(CONFIG_IPV6) -+ v6_match = xt_request_find_match(NFPROTO_IPV6, "hashlimit", 1); -+ if (IS_ERR(v6_match)) { -+ pr_err("The xt_hashlimit module for IPv6 is required\n"); -+ module_put(v4_match->me); -+ return PTR_ERR(v6_match); ++ table_v6 = kvzalloc(table_size * sizeof(struct hlist_head), GFP_KERNEL); ++ if (!table_v6) { ++ kvfree(table_v4); ++ goto err_kmemcache; + } +#endif ++ ++ queue_delayed_work(system_power_efficient_wq, &gc_work, HZ); ++ get_random_bytes(&key, sizeof(key)); + return 0; ++ ++err_kmemcache: ++ kmem_cache_destroy(entry_cache); ++err: ++ atomic64_dec(&refcnt); ++ return -ENOMEM; +} + -+void ratelimiter_module_deinit(void) ++void ratelimiter_uninit(void) +{ -+ module_put(v4_match->me); ++ if (atomic64_dec_return(&refcnt)) ++ return; ++ ++ cancel_delayed_work_sync(&gc_work); ++ gc_entries(NULL); ++ synchronize_rcu(); ++ kvfree(table_v4); +#if IS_ENABLED(CONFIG_IPV6) -+ module_put(v6_match->me); ++ kvfree(table_v6); +#endif ++ kmem_cache_destroy(entry_cache); +} ---- /dev/null -+++ b/net/wireguard/receive.c ++ ++#include "selftest/ratelimiter.h" +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/receive.c 2017-07-06 18:17:33.000000000 +0200 @@ -0,0 +1,311 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. 
*/ + @@ -2564,102 +2571,97 @@ + socket_set_peer_endpoint(peer, &endpoint); +} + -+static inline int skb_data_offset(struct sk_buff *skb, size_t *data_offset, size_t *data_len) ++static inline int skb_prepare_header(struct sk_buff *skb, struct wireguard_device *wg) +{ + struct udphdr *udp; -+ -+ if (unlikely(skb->len < sizeof(struct iphdr))) -+ return -EINVAL; -+ if (unlikely(ip_hdr(skb)->version != 4 && ip_hdr(skb)->version != 6)) -+ return -EINVAL; -+ if (unlikely(ip_hdr(skb)->version == 6 && skb->len < sizeof(struct ipv6hdr))) -+ return -EINVAL; -+ ++ size_t data_offset, data_len; ++ enum message_type message_type; ++ if (unlikely(skb_examine_untrusted_ip_hdr(skb) != skb->protocol || skb_transport_header(skb) < skb->head || (skb_transport_header(skb) + sizeof(struct udphdr)) > skb_tail_pointer(skb))) ++ return -EINVAL; /* Bogus IP header */ + udp = udp_hdr(skb); -+ *data_offset = (u8 *)udp - skb->data; -+ if (unlikely(*data_offset > U16_MAX)) { -+ net_dbg_skb_ratelimited("Packet has offset at impossible location from %pISpfsc\n", skb); ++ data_offset = (u8 *)udp - skb->data; ++ if (unlikely(data_offset > U16_MAX || data_offset + sizeof(struct udphdr) > skb->len)) ++ return -EINVAL; /* Packet has offset at impossible location or isn't big enough to have UDP fields*/ ++ data_len = ntohs(udp->len); ++ if (unlikely(data_len < sizeof(struct udphdr) || data_len > skb->len - data_offset)) ++ return -EINVAL; /* UDP packet is reporting too small of a size or lying about its size */ ++ data_len -= sizeof(struct udphdr); ++ data_offset = (u8 *)udp + sizeof(struct udphdr) - skb->data; ++ if (unlikely(!pskb_may_pull(skb, data_offset + sizeof(struct message_header)) || pskb_trim(skb, data_len + data_offset) < 0)) + return -EINVAL; -+ } -+ if (unlikely(*data_offset + sizeof(struct udphdr) > skb->len)) { -+ net_dbg_skb_ratelimited("Packet isn't big enough to have UDP fields from %pISpfsc\n", skb); ++ skb_pull(skb, data_offset); ++ if (unlikely(skb->len != data_len)) ++ return -EINVAL; /* Final len does not agree with calculated len */ ++ message_type = message_determine_type(skb); ++ __skb_push(skb, data_offset); ++ if (unlikely(!pskb_may_pull(skb, data_offset + message_header_sizes[message_type]))) + return -EINVAL; -+ } -+ *data_len = ntohs(udp->len); -+ if (unlikely(*data_len < sizeof(struct udphdr))) { -+ net_dbg_skb_ratelimited("UDP packet is reporting too small of a size from %pISpfsc\n", skb); -+ return -EINVAL; -+ } -+ if (unlikely(*data_len > skb->len - *data_offset)) { -+ net_dbg_skb_ratelimited("UDP packet is lying about its size from %pISpfsc\n", skb); -+ return -EINVAL; -+ } -+ *data_len -= sizeof(struct udphdr); -+ *data_offset = (u8 *)udp + sizeof(struct udphdr) - skb->data; -+ if (!pskb_may_pull(skb, *data_offset + sizeof(struct message_header))) { -+ net_dbg_skb_ratelimited("Could not pull header into data section from %pISpfsc\n", skb); -+ return -EINVAL; -+ } -+ -+ return 0; ++ __skb_pull(skb, data_offset); ++ return message_type; +} + -+static void receive_handshake_packet(struct wireguard_device *wg, void *data, size_t len, struct sk_buff *skb) ++static void receive_handshake_packet(struct wireguard_device *wg, struct sk_buff *skb) +{ ++ static unsigned long last_under_load = 0; /* Yes this is global, so that our load calculation applies to the whole system. 
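If any interface has seen pressure within the last second, all interfaces keep requiring cookies, which favors caution over per-device precision.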
*/ + struct wireguard_peer *peer = NULL; + enum message_type message_type; + bool under_load; + enum cookie_mac_state mac_state; + bool packet_needs_cookie; + -+ message_type = message_determine_type(data, len); ++ message_type = message_determine_type(skb); + + if (message_type == MESSAGE_HANDSHAKE_COOKIE) { -+ net_dbg_skb_ratelimited("Receiving cookie response from %pISpfsc\n", skb); -+ cookie_message_consume(data, wg); ++ net_dbg_skb_ratelimited("%s: Receiving cookie response from %pISpfsc\n", netdev_pub(wg)->name, skb); ++ cookie_message_consume((struct message_handshake_cookie *)skb->data, wg); + return; + } + -+ under_load = skb_queue_len(&wg->incoming_handshakes) >= MAX_QUEUED_INCOMING_HANDSHAKES / 2; -+ mac_state = cookie_validate_packet(&wg->cookie_checker, skb, data, len, under_load); ++ under_load = skb_queue_len(&wg->incoming_handshakes) >= MAX_QUEUED_INCOMING_HANDSHAKES / 8; ++ if (under_load) ++ last_under_load = jiffies; ++ else ++ under_load = time_is_after_jiffies(last_under_load + HZ); ++ mac_state = cookie_validate_packet(&wg->cookie_checker, skb, under_load); + if ((under_load && mac_state == VALID_MAC_WITH_COOKIE) || (!under_load && mac_state == VALID_MAC_BUT_NO_COOKIE)) + packet_needs_cookie = false; + else if (under_load && mac_state == VALID_MAC_BUT_NO_COOKIE) + packet_needs_cookie = true; + else { -+ net_dbg_skb_ratelimited("Invalid MAC of handshake, dropping packet from %pISpfsc\n", skb); ++ net_dbg_skb_ratelimited("%s: Invalid MAC of handshake, dropping packet from %pISpfsc\n", netdev_pub(wg)->name, skb); + return; + } + + switch (message_type) { -+ case MESSAGE_HANDSHAKE_INITIATION: ++ case MESSAGE_HANDSHAKE_INITIATION: { ++ struct message_handshake_initiation *message = (struct message_handshake_initiation *)skb->data; + if (packet_needs_cookie) { -+ struct message_handshake_initiation *message = data; -+ packet_send_handshake_cookie(wg, skb, message, sizeof(*message), message->sender_index); ++ packet_send_handshake_cookie(wg, skb, message->sender_index); + return; + } -+ peer = noise_handshake_consume_initiation(data, wg); ++ peer = noise_handshake_consume_initiation(message, wg); + if (unlikely(!peer)) { -+ net_dbg_skb_ratelimited("Invalid handshake initiation from %pISpfsc\n", skb); ++ net_dbg_skb_ratelimited("%s: Invalid handshake initiation from %pISpfsc\n", netdev_pub(wg)->name, skb); + return; + } + update_latest_addr(peer, skb); -+ net_dbg_ratelimited("Receiving handshake initiation from peer %Lu (%pISpfsc)\n", peer->internal_id, &peer->endpoint.addr); ++ net_dbg_ratelimited("%s: Receiving handshake initiation from peer %Lu (%pISpfsc)\n", netdev_pub(wg)->name, peer->internal_id, &peer->endpoint.addr); + packet_send_handshake_response(peer); + break; -+ case MESSAGE_HANDSHAKE_RESPONSE: ++ } ++ case MESSAGE_HANDSHAKE_RESPONSE: { ++ struct message_handshake_response *message = (struct message_handshake_response *)skb->data; + if (packet_needs_cookie) { -+ struct message_handshake_response *message = data; -+ packet_send_handshake_cookie(wg, skb, message, sizeof(*message), message->sender_index); ++ packet_send_handshake_cookie(wg, skb, message->sender_index); + return; + } -+ peer = noise_handshake_consume_response(data, wg); ++ peer = noise_handshake_consume_response(message, wg); + if (unlikely(!peer)) { -+ net_dbg_skb_ratelimited("Invalid handshake response from %pISpfsc\n", skb); ++ net_dbg_skb_ratelimited("%s: Invalid handshake response from %pISpfsc\n", netdev_pub(wg)->name, skb); + return; + } + update_latest_addr(peer, skb); -+ 
net_dbg_ratelimited("Receiving handshake response from peer %Lu (%pISpfsc)\n", peer->internal_id, &peer->endpoint.addr); ++ net_dbg_ratelimited("%s: Receiving handshake response from peer %Lu (%pISpfsc)\n", netdev_pub(wg)->name, peer->internal_id, &peer->endpoint.addr); + if (noise_handshake_begin_session(&peer->handshake, &peer->keypairs, true)) { + timers_ephemeral_key_created(peer); + timers_handshake_complete(peer); @@ -2671,6 +2673,7 @@ + packet_send_keepalive(peer); + } + break; ++ } + default: + WARN(1, "Somehow a wrong type of packet wound up in the handshake queue!\n"); + return; @@ -2678,7 +2681,7 @@ + + BUG_ON(!peer); + -+ rx_stats(peer, len); ++ rx_stats(peer, skb->len); + timers_any_authenticated_packet_received(peer); + timers_any_authenticated_packet_traversal(peer); + peer_put(peer); @@ -2686,19 +2689,13 @@ + +void packet_process_queued_handshake_packets(struct work_struct *work) +{ -+ struct wireguard_device *wg = container_of(work, struct wireguard_device, incoming_handshakes_work); ++ struct wireguard_device *wg = container_of(work, struct handshake_worker, work)->wg; + struct sk_buff *skb; -+ size_t len, offset; -+ size_t num_processed = 0; + + while ((skb = skb_dequeue(&wg->incoming_handshakes)) != NULL) { -+ if (!skb_data_offset(skb, &offset, &len)) -+ receive_handshake_packet(wg, skb->data + offset, len, skb); ++ receive_handshake_packet(wg, skb); + dev_kfree_skb(skb); -+ if (++num_processed == MAX_BURST_INCOMING_HANDSHAKES) { -+ queue_work(wg->workqueue, &wg->incoming_handshakes_work); -+ return; -+ } ++ cond_resched(); + } +} + @@ -2709,34 +2706,25 @@ + if (peer->sent_lastminute_handshake) + return; + -+ rcu_read_lock(); -+ keypair = rcu_dereference(peer->keypairs.current_keypair); ++ rcu_read_lock_bh(); ++ keypair = rcu_dereference_bh(peer->keypairs.current_keypair); + if (likely(keypair && keypair->sending.is_valid) && keypair->i_am_the_initiator && + unlikely(time_is_before_eq_jiffies64(keypair->sending.birthdate + REJECT_AFTER_TIME - KEEPALIVE_TIMEOUT - REKEY_TIMEOUT))) + send = true; -+ rcu_read_unlock(); ++ rcu_read_unlock_bh(); + + if (send) { + peer->sent_lastminute_handshake = true; -+ packet_queue_handshake_initiation(peer); ++ packet_queue_handshake_initiation(peer, false); + } +} + -+struct packet_cb { -+ u8 ds; -+}; -+#define PACKET_CB(skb) ((struct packet_cb *)skb->cb) -+ -+static void receive_data_packet(struct sk_buff *skb, struct wireguard_peer *peer, struct endpoint *endpoint, bool used_new_key, int err) ++void packet_consume_data_done(struct sk_buff *skb, struct wireguard_peer *peer, struct endpoint *endpoint, bool used_new_key) +{ + struct net_device *dev; + struct wireguard_peer *routed_peer; + struct wireguard_device *wg; -+ -+ if (unlikely(err < 0 || !peer || !endpoint)) { -+ dev_kfree_skb(skb); -+ return; -+ } ++ unsigned int len; + + socket_set_peer_endpoint(peer, endpoint); + @@ -2746,60 +2734,77 @@ + if (unlikely(used_new_key)) { + peer->sent_lastminute_handshake = false; + packet_send_queue(peer); ++ timers_handshake_complete(peer); + } + + keep_key_fresh(peer); + + /* A packet with length 0 is a keepalive packet */ + if (unlikely(!skb->len)) { -+ net_dbg_ratelimited("Receiving keepalive packet from peer %Lu (%pISpfsc)\n", peer->internal_id, &peer->endpoint.addr); ++ net_dbg_ratelimited("%s: Receiving keepalive packet from peer %Lu (%pISpfsc)\n", netdev_pub(peer->device)->name, peer->internal_id, &peer->endpoint.addr); + goto packet_processed; + } + -+ if (!pskb_may_pull(skb, 1 /* For checking the ip version below */)) { -+ 
++dev->stats.rx_errors; -+ ++dev->stats.rx_length_errors; -+ net_dbg_ratelimited("Packet missing IP version from peer %Lu (%pISpfsc)\n", peer->internal_id, &peer->endpoint.addr); -+ goto packet_processed; -+ } ++ if (unlikely(skb_network_header(skb) < skb->head)) ++ goto dishonest_packet_size; ++ if (unlikely(!(pskb_network_may_pull(skb, sizeof(struct iphdr)) && (ip_hdr(skb)->version == 4 || (ip_hdr(skb)->version == 6 && pskb_network_may_pull(skb, sizeof(struct ipv6hdr))))))) ++ goto dishonest_packet_type; + + skb->dev = dev; + skb->ip_summed = CHECKSUM_UNNECESSARY; -+ if (skb->len >= sizeof(struct iphdr) && ip_hdr(skb)->version == 4) { -+ skb->protocol = htons(ETH_P_IP); ++ skb->protocol = skb_examine_untrusted_ip_hdr(skb); ++ if (skb->protocol == htons(ETH_P_IP)) { ++ len = ntohs(ip_hdr(skb)->tot_len); ++ if (unlikely(len < sizeof(struct iphdr))) ++ goto dishonest_packet_size; + if (INET_ECN_is_ce(PACKET_CB(skb)->ds)) + IP_ECN_set_ce(ip_hdr(skb)); -+ } else if (skb->len >= sizeof(struct ipv6hdr) && ip_hdr(skb)->version == 6) { -+ skb->protocol = htons(ETH_P_IPV6); ++ ++ } else if (skb->protocol == htons(ETH_P_IPV6)) { ++ len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); + if (INET_ECN_is_ce(PACKET_CB(skb)->ds)) + IP6_ECN_set_ce(skb, ipv6_hdr(skb)); -+ } else { -+ ++dev->stats.rx_errors; -+ ++dev->stats.rx_length_errors; -+ net_dbg_ratelimited("Packet neither ipv4 nor ipv6 from peer %Lu (%pISpfsc)\n", peer->internal_id, &peer->endpoint.addr); -+ goto packet_processed; ++ } else ++ goto dishonest_packet_type; ++ ++ if (unlikely(len > skb->len)) { ++ goto dishonest_packet_size; + } ++ if (len < skb->len && unlikely(pskb_trim(skb, len))) ++ goto packet_processed; + + timers_data_received(peer); + + routed_peer = routing_table_lookup_src(&wg->peer_routing_table, skb); + peer_put(routed_peer); /* We don't need the extra reference. 
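routing_table_lookup_src took a reference on routed_peer, but only the pointer comparison below is needed, so it is dropped right away.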
*/ + -+ if (unlikely(routed_peer != peer)) { -+ ++dev->stats.rx_errors; -+ ++dev->stats.rx_frame_errors; -+ net_dbg_skb_ratelimited("Packet has unallowed src IP (%pISc) from peer %Lu (%pISpfsc)\n", skb, peer->internal_id, &peer->endpoint.addr); -+ goto packet_processed; -+ } ++ if (unlikely(routed_peer != peer)) ++ goto dishonest_packet_peer; + ++ len = skb->len; + if (likely(netif_rx(skb) == NET_RX_SUCCESS)) -+ rx_stats(peer, skb->len); ++ rx_stats(peer, len); + else { + ++dev->stats.rx_dropped; -+ net_dbg_ratelimited("Failed to give packet to userspace from peer %Lu (%pISpfsc)\n", peer->internal_id, &peer->endpoint.addr); ++ net_dbg_ratelimited("%s: Failed to give packet to userspace from peer %Lu (%pISpfsc)\n", netdev_pub(peer->device)->name, peer->internal_id, &peer->endpoint.addr); + } + goto continue_processing; + ++dishonest_packet_peer: ++ net_dbg_skb_ratelimited("%s: Packet has unallowed src IP (%pISc) from peer %Lu (%pISpfsc)\n", netdev_pub(peer->device)->name, skb, peer->internal_id, &peer->endpoint.addr); ++ ++dev->stats.rx_errors; ++ ++dev->stats.rx_frame_errors; ++ goto packet_processed; ++dishonest_packet_type: ++ net_dbg_ratelimited("%s: Packet is neither ipv4 nor ipv6 from peer %Lu (%pISpfsc)\n", netdev_pub(peer->device)->name, peer->internal_id, &peer->endpoint.addr); ++ ++dev->stats.rx_errors; ++ ++dev->stats.rx_frame_errors; ++ goto packet_processed; ++dishonest_packet_size: ++ net_dbg_ratelimited("%s: Packet has incorrect size from peer %Lu (%pISpfsc)\n", netdev_pub(peer->device)->name, peer->internal_id, &peer->endpoint.addr); ++ ++dev->stats.rx_errors; ++ ++dev->stats.rx_length_errors; ++ goto packet_processed; +packet_processed: + dev_kfree_skb(skb); +continue_processing: @@ -2810,32 +2815,34 @@ + +void packet_receive(struct wireguard_device *wg, struct sk_buff *skb) +{ -+ size_t len, offset; -+ -+ if (unlikely(skb_data_offset(skb, &offset, &len) < 0)) ++ int message_type = skb_prepare_header(skb, wg); ++ if (unlikely(message_type < 0)) + goto err; -+ switch (message_determine_type(skb->data + offset, len)) { ++ switch (message_type) { + case MESSAGE_HANDSHAKE_INITIATION: + case MESSAGE_HANDSHAKE_RESPONSE: -+ case MESSAGE_HANDSHAKE_COOKIE: ++ case MESSAGE_HANDSHAKE_COOKIE: { ++ int cpu_index, cpu, target_cpu; + if (skb_queue_len(&wg->incoming_handshakes) > MAX_QUEUED_INCOMING_HANDSHAKES) { -+ net_dbg_skb_ratelimited("Too many handshakes queued, dropping packet from %pISpfsc\n", skb); -+ goto err; -+ } -+ if (skb_linearize(skb) < 0) { -+ net_dbg_skb_ratelimited("Unable to linearize handshake skb from %pISpfsc\n", skb); ++ net_dbg_skb_ratelimited("%s: Too many handshakes queued, dropping packet from %pISpfsc\n", netdev_pub(wg)->name, skb); + goto err; + } + skb_queue_tail(&wg->incoming_handshakes, skb); ++ /* Select the CPU in a round-robin */ ++ cpu_index = ((unsigned int)atomic_inc_return(&wg->incoming_handshake_seqnr)) % cpumask_weight(cpu_online_mask); ++ target_cpu = cpumask_first(cpu_online_mask); ++ for (cpu = 0; cpu < cpu_index; ++cpu) ++ target_cpu = cpumask_next(target_cpu, cpu_online_mask); + /* Queues up a call to packet_process_queued_handshake_packets(skb): */ -+ queue_work(wg->workqueue, &wg->incoming_handshakes_work); ++ queue_work_on(target_cpu, wg->incoming_handshake_wq, &per_cpu_ptr(wg->incoming_handshakes_worker, target_cpu)->work); + break; ++ } + case MESSAGE_DATA: + PACKET_CB(skb)->ds = ip_tunnel_get_dsfield(ip_hdr(skb), skb); -+ packet_consume_data(skb, offset, wg, receive_data_packet); ++ packet_consume_data(skb, wg); + break; + default: -+ 
net_dbg_skb_ratelimited("Invalid packet from %pISpfsc\n", skb); ++ net_dbg_skb_ratelimited("%s: Invalid packet from %pISpfsc\n", netdev_pub(wg)->name, skb); + goto err; + } + return; @@ -2843,9 +2850,9 @@ +err: + dev_kfree_skb(skb); +} ---- /dev/null -+++ b/net/wireguard/routingtable.c -@@ -0,0 +1,516 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/routingtable.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,345 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ + +#include "routingtable.h" @@ -2855,16 +2862,10 @@ + struct routing_table_node __rcu *bit[2]; + struct rcu_head rcu; + struct wireguard_peer *peer; -+ u8 cidr; -+ u8 bit_at_a, bit_at_b; -+ bool incidental; -+ u8 bits[]; ++ u8 cidr, bit_at_a, bit_at_b; ++ u8 bits[] __aligned(__alignof__(u64)); +}; + -+static inline u8 bit_at(const u8 *key, u8 a, u8 b) -+{ -+ return (key[a] >> b) & 1; -+} +static inline void copy_and_assign_cidr(struct routing_table_node *node, const u8 *src, u8 cidr) +{ + memcpy(node->bits, src, (cidr + 7) / 8); @@ -2873,63 +2874,77 @@ + node->bit_at_a = cidr / 8; + node->bit_at_b = 7 - (cidr % 8); +} ++#define choose_node(parent, key) parent->bit[(key[parent->bit_at_a] >> parent->bit_at_b) & 1] + -+/* Non-recursive RCU expansion of: -+ * -+ * free_node(node) -+ * { -+ * if (!node) -+ * return; -+ * free_node(node->bit[0]); -+ * free_node(node->bit[1]); -+ * kfree_rcu(node); -+ * } -+ */ -+#define ref(p) rcu_access_pointer(p) -+#define push(p) do { BUG_ON(len >= 128); stack[len++] = rcu_dereference_protected(p, lockdep_is_held(lock)); } while (0) -+static void free_node(struct routing_table_node *top, struct mutex *lock) ++static void node_free_rcu(struct rcu_head *rcu) +{ -+ struct routing_table_node *stack[128]; -+ struct routing_table_node *node = NULL; -+ struct routing_table_node *prev = NULL; -+ unsigned int len = 0; ++ kfree(container_of(rcu, struct routing_table_node, rcu)); ++} ++#define push(p, lock) ({ \ ++ if (rcu_access_pointer(p)) { \ ++ BUG_ON(len >= 128); \ ++ stack[len++] = lock ? 
rcu_dereference_protected(p, lockdep_is_held((struct mutex *)lock)) : rcu_dereference_bh(p); \ ++ } \ ++ true; \ ++}) ++#define walk_prep \ ++ struct routing_table_node *stack[128], *node; \ ++ unsigned int len; ++#define walk(top, lock) for (len = 0, push(top, lock); len > 0 && (node = stack[--len]) && push(node->bit[0], lock) && push(node->bit[1], lock);) + -+ if (!top) -+ return; ++static void free_root_node(struct routing_table_node __rcu *top, struct mutex *lock) ++{ ++ walk_prep; ++ walk (top, lock) ++ call_rcu_bh(&node->rcu, node_free_rcu); ++} + -+ stack[len++] = top; -+ while (len > 0) { -+ node = stack[len - 1]; -+ if (!prev || ref(prev->bit[0]) == node || ref(prev->bit[1]) == node) { -+ if (ref(node->bit[0])) -+ push(node->bit[0]); -+ else if (ref(node->bit[1])) -+ push(node->bit[1]); -+ } else if (ref(node->bit[0]) == prev) { -+ if (ref(node->bit[1])) -+ push(node->bit[1]); -+ } else { -+ kfree_rcu(node, rcu); -+ --len; -+ } -+ prev = node; ++static size_t count_nodes(struct routing_table_node __rcu *top) ++{ ++ size_t ret = 0; ++ walk_prep; ++ walk (top, NULL) { ++ if (node->peer) ++ ++ret; + } ++ return ret; ++} ++ ++static int walk_ips_by_peer(struct routing_table_node __rcu *top, int family, void *ctx, struct wireguard_peer *peer, int (*func)(void *ctx, union nf_inet_addr ip, u8 cidr, int family), struct mutex *maybe_lock) ++{ ++ int ret; ++ union nf_inet_addr ip = { .all = { 0 } }; ++ walk_prep; ++ ++ if (unlikely(!peer)) ++ return 0; ++ ++ walk (top, maybe_lock) { ++ if (node->peer != peer) ++ continue; ++ memcpy(ip.all, node->bits, family == AF_INET6 ? 16 : 4); ++ ret = func(ctx, ip, node->cidr, family); ++ if (ret) ++ return ret; ++ } ++ return 0; +} +#undef push -+#define push(p) do { BUG_ON(len >= 128); stack[len++] = p; } while (0) -+static bool walk_remove_by_peer(struct routing_table_node __rcu **top, struct wireguard_peer *peer, struct mutex *lock) -+{ -+ struct routing_table_node __rcu **stack[128]; -+ struct routing_table_node __rcu **nptr; -+ struct routing_table_node *node = NULL; -+ struct routing_table_node *prev = NULL; -+ unsigned int len = 0; -+ bool ret = false; + -+ stack[len++] = top; -+ while (len > 0) { ++#define ref(p) rcu_access_pointer(p) ++#define deref(p) rcu_dereference_protected(*p, lockdep_is_held(lock)) ++#define push(p) ({ BUG_ON(len >= 128); stack[len++] = p; }) ++static void walk_remove_by_peer(struct routing_table_node __rcu **top, struct wireguard_peer *peer, struct mutex *lock) ++{ ++ struct routing_table_node __rcu **stack[128], **nptr, *node, *prev; ++ unsigned int len; ++ ++ if (unlikely(!peer || !ref(*top))) ++ return; ++ ++ for (prev = NULL, len = 0, push(top); len > 0; prev = node) { + nptr = stack[len - 1]; -+ node = rcu_dereference_protected(*nptr, lockdep_is_held(lock)); ++ node = deref(nptr); + if (!node) { + --len; + continue; @@ -2944,114 +2959,78 @@ + push(&node->bit[1]); + } else { + if (node->peer == peer) { -+ ret = true; + node->peer = NULL; -+ node->incidental = true; + if (!node->bit[0] || !node->bit[1]) { -+ /* collapse (even if both are null) */ -+ rcu_assign_pointer(*nptr, rcu_dereference_protected(node->bit[!node->bit[0]], lockdep_is_held(lock))); -+ rcu_assign_pointer(node->bit[0], NULL); -+ rcu_assign_pointer(node->bit[1], NULL); -+ free_node(node, lock); ++ rcu_assign_pointer(*nptr, deref(&node->bit[!ref(node->bit[0])])); ++ call_rcu_bh(&node->rcu, node_free_rcu); ++ node = deref(nptr); + } + } + --len; + } -+ prev = node; + } -+ -+ return ret; +} +#undef ref ++#undef deref +#undef push + -+static inline bool 
match(const struct routing_table_node *node, const u8 *key, u8 match_len) ++static inline unsigned int fls128(u64 a, u64 b) +{ -+ u8 full_blocks_to_match = match_len / 8; -+ u8 bits_leftover = match_len % 8; -+ u8 mask; -+ const u8 *a = node->bits, *b = key; -+ if (memcmp(a, b, full_blocks_to_match)) -+ return false; -+ if (!bits_leftover) -+ return true; -+ mask = ~(0xff >> bits_leftover); -+ return (a[full_blocks_to_match] & mask) == (b[full_blocks_to_match] & mask); ++ return a ? fls64(a) + 64 : fls64(b); +} + -+static inline u8 common_bits(const struct routing_table_node *node, const u8 *key, u8 match_len) ++static inline u8 common_bits(const struct routing_table_node *node, const u8 *key, u8 bits) +{ -+ u8 max = (((match_len > node->cidr) ? match_len : node->cidr) + 7) / 8; -+ u8 bits = 0; -+ u8 i, mask; -+ const u8 *a = node->bits, *b = key; -+ for (i = 0; i < max; ++i, bits += 8) { -+ if (a[i] != b[i]) -+ break; -+ } -+ if (i == max) -+ return bits; -+ for (mask = 128; mask > 0; mask /= 2, ++bits) { -+ if ((a[i] & mask) != (b[i] & mask)) -+ return bits; -+ } ++ if (bits == 32) ++ return 32 - fls(be32_to_cpu(*(const __be32 *)node->bits ^ *(const __be32 *)key)); ++ else if (bits == 128) ++ return 128 - fls128(be64_to_cpu(*(const __be64 *)&node->bits[0] ^ *(const __be64 *)&key[0]), be64_to_cpu(*(const __be64 *)&node->bits[8] ^ *(const __be64 *)&key[8])); + BUG(); -+ return bits; -+} -+ -+static int remove(struct routing_table_node __rcu **trie, const u8 *key, u8 cidr, struct mutex *lock) -+{ -+ struct routing_table_node *parent = NULL, *node; -+ node = rcu_dereference_protected(*trie, lockdep_is_held(lock)); -+ while (node && node->cidr <= cidr && match(node, key, node->cidr)) { -+ if (node->cidr == cidr) { -+ /* exact match */ -+ node->incidental = true; -+ node->peer = NULL; -+ if (!node->bit[0] || !node->bit[1]) { -+ /* collapse (even if both are null) */ -+ if (parent) -+ rcu_assign_pointer(parent->bit[bit_at(key, parent->bit_at_a, parent->bit_at_b)], -+ rcu_dereference_protected(node->bit[(!node->bit[0]) ? 
1 : 0], lockdep_is_held(lock))); -+ rcu_assign_pointer(node->bit[0], NULL); -+ rcu_assign_pointer(node->bit[1], NULL); -+ free_node(node, lock); -+ } -+ return 0; -+ } -+ parent = node; -+ node = rcu_dereference_protected(parent->bit[bit_at(key, parent->bit_at_a, parent->bit_at_b)], lockdep_is_held(lock)); -+ } -+ return -ENOENT; ++ return 0; +} + +static inline struct routing_table_node *find_node(struct routing_table_node *trie, u8 bits, const u8 *key) +{ + struct routing_table_node *node = trie, *found = NULL; -+ while (node && match(node, key, node->cidr)) { -+ if (!node->incidental) ++ ++ while (node && common_bits(node, key, bits) >= node->cidr) { ++ if (node->peer) + found = node; + if (node->cidr == bits) + break; -+ node = rcu_dereference(node->bit[bit_at(key, node->bit_at_a, node->bit_at_b)]); ++ node = rcu_dereference_bh(choose_node(node, key)); + } + return found; +} + -+static inline bool node_placement(struct routing_table_node __rcu *trie, const u8 *key, u8 cidr, struct routing_table_node **rnode, struct mutex *lock) ++/* Returns a strong reference to a peer */ ++static inline struct wireguard_peer *lookup(struct routing_table_node __rcu *root, u8 bits, const void *ip) ++{ ++ struct wireguard_peer *peer = NULL; ++ struct routing_table_node *node; ++ ++ rcu_read_lock_bh(); ++ node = find_node(rcu_dereference_bh(root), bits, ip); ++ if (node) ++ peer = peer_get(node->peer); ++ rcu_read_unlock_bh(); ++ return peer; ++} ++ ++static inline bool node_placement(struct routing_table_node __rcu *trie, const u8 *key, u8 cidr, u8 bits, struct routing_table_node **rnode, struct mutex *lock) +{ + bool exact = false; + struct routing_table_node *parent = NULL, *node = rcu_dereference_protected(trie, lockdep_is_held(lock)); -+ while (node && node->cidr <= cidr && match(node, key, node->cidr)) { ++ ++ while (node && node->cidr <= cidr && common_bits(node, key, bits) >= node->cidr) { + parent = node; + if (parent->cidr == cidr) { + exact = true; + break; + } -+ node = rcu_dereference_protected(parent->bit[bit_at(key, parent->bit_at_a, parent->bit_at_b)], lockdep_is_held(lock)); ++ node = rcu_dereference_protected(choose_node(parent, key), lockdep_is_held(lock)); + } -+ if (rnode) -+ *rnode = parent; ++ *rnode = parent; + return exact; +} + @@ -3068,9 +3047,7 @@ + rcu_assign_pointer(*trie, node); + return 0; + } -+ if (node_placement(*trie, key, cidr, &node, lock)) { -+ /* exact match */ -+ node->incidental = false; ++ if (node_placement(*trie, key, cidr, bits, &node, lock)) { + node->peer = peer; + return 0; + } @@ -3083,112 +3060,40 @@ + + if (!node) + down = rcu_dereference_protected(*trie, lockdep_is_held(lock)); -+ else -+ down = rcu_dereference_protected(node->bit[bit_at(key, node->bit_at_a, node->bit_at_b)], lockdep_is_held(lock)); -+ if (!down) { -+ rcu_assign_pointer(node->bit[bit_at(key, node->bit_at_a, node->bit_at_b)], newnode); -+ return 0; ++ else { ++ down = rcu_dereference_protected(choose_node(node, key), lockdep_is_held(lock)); ++ if (!down) { ++ rcu_assign_pointer(choose_node(node, key), newnode); ++ return 0; ++ } + } -+ /* here we must be inserting between node and down */ -+ cidr = min(cidr, common_bits(down, key, cidr)); ++ cidr = min(cidr, common_bits(down, key, bits)); + parent = node; + -+ /* we either need to make a new branch above down and newnode -+ * or newnode can be the branch. 
newnode can be the branch if -+ * its cidr == bits_in_common */ + if (newnode->cidr == cidr) { -+ /* newnode can be the branch */ -+ rcu_assign_pointer(newnode->bit[bit_at(down->bits, newnode->bit_at_a, newnode->bit_at_b)], down); ++ rcu_assign_pointer(choose_node(newnode, down->bits), down); + if (!parent) + rcu_assign_pointer(*trie, newnode); + else -+ rcu_assign_pointer(parent->bit[bit_at(newnode->bits, parent->bit_at_a, parent->bit_at_b)], newnode); ++ rcu_assign_pointer(choose_node(parent, newnode->bits), newnode); + } else { -+ /* reparent */ + node = kzalloc(sizeof(*node) + (bits + 7) / 8, GFP_KERNEL); + if (!node) { + kfree(newnode); + return -ENOMEM; + } -+ node->incidental = true; + copy_and_assign_cidr(node, newnode->bits, cidr); + -+ rcu_assign_pointer(node->bit[bit_at(down->bits, node->bit_at_a, node->bit_at_b)], down); -+ rcu_assign_pointer(node->bit[bit_at(newnode->bits, node->bit_at_a, node->bit_at_b)], newnode); ++ rcu_assign_pointer(choose_node(node, down->bits), down); ++ rcu_assign_pointer(choose_node(node, newnode->bits), newnode); + if (!parent) + rcu_assign_pointer(*trie, node); + else -+ rcu_assign_pointer(parent->bit[bit_at(node->bits, parent->bit_at_a, parent->bit_at_b)], node); ++ rcu_assign_pointer(choose_node(parent, node->bits), node); + } + return 0; +} + -+#define push(p) do { \ -+ struct routing_table_node *next = (maybe_lock ? rcu_dereference_protected(p, lockdep_is_held(maybe_lock)) : rcu_dereference(p)); \ -+ if (next) { \ -+ BUG_ON(len >= 128); \ -+ stack[len++] = next; \ -+ } \ -+} while (0) -+static int walk_ips(struct routing_table_node *top, int family, void *ctx, int (*func)(void *ctx, struct wireguard_peer *peer, union nf_inet_addr ip, u8 cidr, int family), struct mutex *maybe_lock) -+{ -+ int ret; -+ union nf_inet_addr ip = { .all = { 0 } }; -+ struct routing_table_node *stack[128]; -+ struct routing_table_node *node; -+ unsigned int len = 0; -+ struct wireguard_peer *peer; -+ -+ if (!top) -+ return 0; -+ -+ stack[len++] = top; -+ while (len > 0) { -+ node = stack[--len]; -+ -+ peer = peer_get(node->peer); -+ if (peer) { -+ memcpy(ip.all, node->bits, family == AF_INET6 ? 16 : 4); -+ ret = func(ctx, peer, ip, node->cidr, family); -+ peer_put(peer); -+ if (ret) -+ return ret; -+ } -+ -+ push(node->bit[0]); -+ push(node->bit[1]); -+ } -+ return 0; -+} -+static int walk_ips_by_peer(struct routing_table_node *top, int family, void *ctx, struct wireguard_peer *peer, int (*func)(void *ctx, union nf_inet_addr ip, u8 cidr, int family), struct mutex *maybe_lock) -+{ -+ int ret; -+ union nf_inet_addr ip = { .all = { 0 } }; -+ struct routing_table_node *stack[128]; -+ struct routing_table_node *node; -+ unsigned int len = 0; -+ -+ if (!top) -+ return 0; -+ -+ stack[len++] = top; -+ while (len > 0) { -+ node = stack[--len]; -+ -+ if (node->peer == peer) { -+ memcpy(ip.all, node->bits, family == AF_INET6 ? 
16 : 4); -+ ret = func(ctx, ip, node->cidr, family); -+ if (ret) -+ return ret; -+ } -+ -+ push(node->bit[0]); -+ push(node->bit[1]); -+ } -+ return 0; -+} -+#undef push -+ +void routing_table_init(struct routing_table *table) +{ + memset(table, 0, sizeof(struct routing_table)); @@ -3198,9 +3103,9 @@ +void routing_table_free(struct routing_table *table) +{ + mutex_lock(&table->table_update_lock); -+ free_node(rcu_dereference_protected(table->root4, lockdep_is_held(&table->table_update_lock)), &table->table_update_lock); ++ free_root_node(table->root4, &table->table_update_lock); + rcu_assign_pointer(table->root4, NULL); -+ free_node(rcu_dereference_protected(table->root6, lockdep_is_held(&table->table_update_lock)), &table->table_update_lock); ++ free_root_node(table->root6, &table->table_update_lock); + rcu_assign_pointer(table->root6, NULL); + mutex_unlock(&table->table_update_lock); +} @@ -3208,7 +3113,7 @@ +int routing_table_insert_v4(struct routing_table *table, const struct in_addr *ip, u8 cidr, struct wireguard_peer *peer) +{ + int ret; -+ if (cidr > 32) ++ if (unlikely(cidr > 32 || !peer)) + return -EINVAL; + mutex_lock(&table->table_update_lock); + ret = add(&table->root4, 32, (const u8 *)ip, cidr, peer, &table->table_update_lock); @@ -3219,7 +3124,7 @@ +int routing_table_insert_v6(struct routing_table *table, const struct in6_addr *ip, u8 cidr, struct wireguard_peer *peer) +{ + int ret; -+ if (cidr > 128) ++ if (unlikely(cidr > 128 || !peer)) + return -EINVAL; + mutex_lock(&table->table_update_lock); + ret = add(&table->root6, 128, (const u8 *)ip, cidr, peer, &table->table_update_lock); @@ -3227,88 +3132,34 @@ + return ret; +} + -+/* Returns a strong reference to a peer */ -+inline struct wireguard_peer *routing_table_lookup_v4(struct routing_table *table, const struct in_addr *ip) ++void routing_table_remove_by_peer(struct routing_table *table, struct wireguard_peer *peer) +{ -+ struct wireguard_peer *peer = NULL; -+ struct routing_table_node *node; -+ -+ rcu_read_lock(); -+ node = find_node(rcu_dereference(table->root4), 32, (const u8 *)ip); -+ if (node) -+ peer = peer_get(node->peer); -+ rcu_read_unlock(); -+ return peer; -+} -+ -+/* Returns a strong reference to a peer */ -+inline struct wireguard_peer *routing_table_lookup_v6(struct routing_table *table, const struct in6_addr *ip) -+{ -+ struct wireguard_peer *peer = NULL; -+ struct routing_table_node *node; -+ -+ rcu_read_lock(); -+ node = find_node(rcu_dereference(table->root6), 128, (const u8 *)ip); -+ if (node) -+ peer = peer_get(node->peer); -+ rcu_read_unlock(); -+ return peer; -+} -+ -+int routing_table_remove_v4(struct routing_table *table, const struct in_addr *ip, u8 cidr) -+{ -+ int ret; + mutex_lock(&table->table_update_lock); -+ ret = remove(&table->root4, (const u8 *)ip, cidr, &table->table_update_lock); ++ walk_remove_by_peer(&table->root4, peer, &table->table_update_lock); ++ walk_remove_by_peer(&table->root6, peer, &table->table_update_lock); + mutex_unlock(&table->table_update_lock); -+ return ret; +} + -+int routing_table_remove_v6(struct routing_table *table, const struct in6_addr *ip, u8 cidr) ++size_t routing_table_count_nodes(struct routing_table *table) +{ -+ int ret; -+ mutex_lock(&table->table_update_lock); -+ ret = remove(&table->root6, (const u8 *)ip, cidr, &table->table_update_lock); -+ mutex_unlock(&table->table_update_lock); -+ return ret; -+} -+ -+int routing_table_remove_by_peer(struct routing_table *table, struct wireguard_peer *peer) -+{ -+ bool found; -+ 
mutex_lock(&table->table_update_lock); -+ found = walk_remove_by_peer(&table->root4, peer, &table->table_update_lock) | walk_remove_by_peer(&table->root6, peer, &table->table_update_lock); -+ mutex_unlock(&table->table_update_lock); -+ return found ? 0 : -EINVAL; -+} -+ -+/* Calls func with a strong reference to each peer, before putting it when the function has completed. -+ * It's thus up to the caller to call peer_put on it if it's going to be used elsewhere after or stored. */ -+int routing_table_walk_ips(struct routing_table *table, void *ctx, int (*func)(void *ctx, struct wireguard_peer *peer, union nf_inet_addr ip, u8 cidr, int family)) -+{ -+ int ret; -+ rcu_read_lock(); -+ ret = walk_ips(rcu_dereference(table->root4), AF_INET, ctx, func, NULL); -+ rcu_read_unlock(); -+ if (ret) -+ return ret; -+ rcu_read_lock(); -+ ret = walk_ips(rcu_dereference(table->root6), AF_INET6, ctx, func, NULL); -+ rcu_read_unlock(); ++ size_t ret; ++ rcu_read_lock_bh(); ++ ret = count_nodes(table->root4) + count_nodes(table->root6); ++ rcu_read_unlock_bh(); + return ret; +} + +int routing_table_walk_ips_by_peer(struct routing_table *table, void *ctx, struct wireguard_peer *peer, int (*func)(void *ctx, union nf_inet_addr ip, u8 cidr, int family)) +{ + int ret; -+ rcu_read_lock(); -+ ret = walk_ips_by_peer(rcu_dereference(table->root4), AF_INET, ctx, peer, func, NULL); -+ rcu_read_unlock(); ++ rcu_read_lock_bh(); ++ ret = walk_ips_by_peer(table->root4, AF_INET, ctx, peer, func, NULL); ++ rcu_read_unlock_bh(); + if (ret) + return ret; -+ rcu_read_lock(); -+ ret = walk_ips_by_peer(rcu_dereference(table->root6), AF_INET6, ctx, peer, func, NULL); -+ rcu_read_unlock(); ++ rcu_read_lock_bh(); ++ ret = walk_ips_by_peer(table->root6, AF_INET6, ctx, peer, func, NULL); ++ rcu_read_unlock_bh(); + return ret; +} + @@ -3316,55 +3167,40 @@ +{ + int ret; + mutex_lock(&table->table_update_lock); -+ ret = walk_ips_by_peer(rcu_dereference_protected(table->root4, lockdep_is_held(&table->table_update_lock)), AF_INET, ctx, peer, func, &table->table_update_lock); ++ ret = walk_ips_by_peer(table->root4, AF_INET, ctx, peer, func, &table->table_update_lock); + mutex_unlock(&table->table_update_lock); + if (ret) + return ret; + mutex_lock(&table->table_update_lock); -+ ret = walk_ips_by_peer(rcu_dereference_protected(table->root6, lockdep_is_held(&table->table_update_lock)), AF_INET6, ctx, peer, func, &table->table_update_lock); ++ ret = walk_ips_by_peer(table->root6, AF_INET6, ctx, peer, func, &table->table_update_lock); + mutex_unlock(&table->table_update_lock); + return ret; +} + -+static inline bool has_valid_ip_header(struct sk_buff *skb) -+{ -+ if (unlikely(skb->len < sizeof(struct iphdr))) -+ return false; -+ else if (unlikely(skb->len < sizeof(struct ipv6hdr) && ip_hdr(skb)->version == 6)) -+ return false; -+ else if (unlikely(ip_hdr(skb)->version != 4 && ip_hdr(skb)->version != 6)) -+ return false; -+ return true; -+} -+ +/* Returns a strong reference to a peer */ +struct wireguard_peer *routing_table_lookup_dst(struct routing_table *table, struct sk_buff *skb) +{ -+ if (unlikely(!has_valid_ip_header(skb))) -+ return NULL; -+ if (ip_hdr(skb)->version == 4) -+ return routing_table_lookup_v4(table, (struct in_addr *)&ip_hdr(skb)->daddr); -+ else if (ip_hdr(skb)->version == 6) -+ return routing_table_lookup_v6(table, &ipv6_hdr(skb)->daddr); ++ if (skb->protocol == htons(ETH_P_IP)) ++ return lookup(table->root4, 32, &ip_hdr(skb)->daddr); ++ else if (skb->protocol == htons(ETH_P_IPV6)) ++ return lookup(table->root6, 128, 
&ipv6_hdr(skb)->daddr); + return NULL; +} + +/* Returns a strong reference to a peer */ +struct wireguard_peer *routing_table_lookup_src(struct routing_table *table, struct sk_buff *skb) +{ -+ if (unlikely(!has_valid_ip_header(skb))) -+ return NULL; -+ if (ip_hdr(skb)->version == 4) -+ return routing_table_lookup_v4(table, (struct in_addr *)&ip_hdr(skb)->saddr); -+ else if (ip_hdr(skb)->version == 6) -+ return routing_table_lookup_v6(table, &ipv6_hdr(skb)->saddr); ++ if (skb->protocol == htons(ETH_P_IP)) ++ return lookup(table->root4, 32, &ip_hdr(skb)->saddr); ++ else if (skb->protocol == htons(ETH_P_IPV6)) ++ return lookup(table->root6, 128, &ipv6_hdr(skb)->saddr); + return NULL; +} + -+#include "selftest/routing-table.h" ---- /dev/null -+++ b/net/wireguard/send.c -@@ -0,0 +1,192 @@ ++#include "selftest/routingtable.h" +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/send.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,197 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ + +#include "packets.h" @@ -3394,7 +3230,7 @@ + peer->last_sent_handshake = get_jiffies_64(); + up_write(&peer->handshake.lock); + -+ net_dbg_ratelimited("Sending handshake initiation to peer %Lu (%pISpfsc)\n", peer->internal_id, &peer->endpoint.addr); ++ net_dbg_ratelimited("%s: Sending handshake initiation to peer %Lu (%pISpfsc)\n", netdev_pub(peer->device)->name, peer->internal_id, &peer->endpoint.addr); + + if (noise_handshake_create_initiation(&packet, &peer->handshake)) { + cookie_add_mac_to_packet(&packet, sizeof(packet), peer); @@ -3411,8 +3247,11 @@ + peer_put(peer); +} + -+void packet_queue_handshake_initiation(struct wireguard_peer *peer) ++void packet_queue_handshake_initiation(struct wireguard_peer *peer, bool is_retry) +{ ++ if (!is_retry) ++ peer->timer_handshake_attempts = 0; ++ + /* First checking the timestamp here is just an optimization; it will + * be caught while properly locked inside the actual work queue. 
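+ * Losing the race here is harmless: at worst we skip queueing work that a
+ * concurrent sender has already made redundant, and the locked re-check in
+ * the work handler stays authoritative.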
*/ + if (!time_is_before_jiffies64(peer->last_sent_handshake + REKEY_TIMEOUT)) @@ -3423,7 +3262,7 @@ + return; + + /* Queues up calling packet_send_queued_handshakes(peer), where we do a peer_put(peer) after: */ -+ if (!queue_work(peer->device->workqueue, &peer->transmit_handshake_work)) ++ if (!queue_work(peer->device->peer_wq, &peer->transmit_handshake_work)) + peer_put(peer); /* If the work was already queued, we want to drop the extra reference */ +} + @@ -3431,7 +3270,7 @@ +{ + struct message_handshake_response packet; + -+ net_dbg_ratelimited("Sending handshake response to peer %Lu (%pISpfsc)\n", peer->internal_id, &peer->endpoint.addr); ++ net_dbg_ratelimited("%s: Sending handshake response to peer %Lu (%pISpfsc)\n", netdev_pub(peer->device)->name, peer->internal_id, &peer->endpoint.addr); + peer->last_sent_handshake = get_jiffies_64(); + + if (noise_handshake_create_response(&packet, &peer->handshake)) { @@ -3444,12 +3283,12 @@ + } +} + -+void packet_send_handshake_cookie(struct wireguard_device *wg, struct sk_buff *initiating_skb, void *data, size_t data_len, __le32 sender_index) ++void packet_send_handshake_cookie(struct wireguard_device *wg, struct sk_buff *initiating_skb, __le32 sender_index) +{ + struct message_handshake_cookie packet; + -+ net_dbg_skb_ratelimited("Sending cookie response for denied handshake message for %pISpfsc\n", initiating_skb); -+ cookie_message_create(&packet, initiating_skb, data, data_len, sender_index, &wg->cookie_checker); ++ net_dbg_skb_ratelimited("%s: Sending cookie response for denied handshake message for %pISpfsc\n", netdev_pub(wg)->name, initiating_skb); ++ cookie_message_create(&packet, initiating_skb, sender_index, &wg->cookie_checker); + socket_send_buffer_as_reply_to_skb(wg, initiating_skb, &packet, sizeof(packet)); +} + @@ -3458,16 +3297,16 @@ + struct noise_keypair *keypair; + bool send = false; + -+ rcu_read_lock(); -+ keypair = rcu_dereference(peer->keypairs.current_keypair); ++ rcu_read_lock_bh(); ++ keypair = rcu_dereference_bh(peer->keypairs.current_keypair); + if (likely(keypair && keypair->sending.is_valid) && + (unlikely(atomic64_read(&keypair->sending.counter.counter) > REKEY_AFTER_MESSAGES) || + (keypair->i_am_the_initiator && unlikely(time_is_before_eq_jiffies64(keypair->sending.birthdate + REKEY_AFTER_TIME))))) + send = true; -+ rcu_read_unlock(); ++ rcu_read_unlock_bh(); + + if (send) -+ packet_queue_handshake_initiation(peer); ++ packet_queue_handshake_initiation(peer, false); +} + +void packet_send_keepalive(struct wireguard_peer *peer) @@ -3480,20 +3319,23 @@ + skb_reserve(skb, DATA_PACKET_HEAD_ROOM); + skb->dev = netdev_pub(peer->device); + skb_queue_tail(&peer->tx_packet_queue, skb); -+ net_dbg_ratelimited("Sending keepalive packet to peer %Lu (%pISpfsc)\n", peer->internal_id, &peer->endpoint.addr); ++ net_dbg_ratelimited("%s: Sending keepalive packet to peer %Lu (%pISpfsc)\n", netdev_pub(peer->device)->name, peer->internal_id, &peer->endpoint.addr); + } + packet_send_queue(peer); +} + -+static void message_create_data_done(struct sk_buff_head *queue, struct wireguard_peer *peer) ++void packet_create_data_done(struct sk_buff_head *queue, struct wireguard_peer *peer) +{ + struct sk_buff *skb, *tmp; + bool is_keepalive, data_sent = false; + ++ if (unlikely(!skb_queue_len(queue))) ++ return; ++ + timers_any_authenticated_packet_traversal(peer); -+ skb_queue_walk_safe(queue, skb, tmp) { ++ skb_queue_walk_safe (queue, skb, tmp) { + is_keepalive = skb->len == message_data_len(0); -+ if (likely(!socket_send_skb_to_peer(peer, 
skb, *(u8 *)skb->cb) && !is_keepalive)) ++ if (likely(!socket_send_skb_to_peer(peer, skb, PACKET_CB(skb)->ds) && !is_keepalive)) + data_sent = true; + } + if (likely(data_sent)) @@ -3508,21 +3350,20 @@ +void packet_send_queue(struct wireguard_peer *peer) +{ + struct sk_buff_head queue; -+ unsigned long flags; + + peer->need_resend_queue = false; + + /* Steal the current queue into our local one. */ + skb_queue_head_init(&queue); -+ spin_lock_irqsave(&peer->tx_packet_queue.lock, flags); ++ spin_lock_bh(&peer->tx_packet_queue.lock); + skb_queue_splice_init(&peer->tx_packet_queue, &queue); -+ spin_unlock_irqrestore(&peer->tx_packet_queue.lock, flags); ++ spin_unlock_bh(&peer->tx_packet_queue.lock); + + if (unlikely(!skb_queue_len(&queue))) + return; + + /* We submit it for encryption and sending. */ -+ switch (packet_create_data(&queue, peer, message_create_data_done)) { ++ switch (packet_create_data(&queue, peer)) { + case 0: + break; + case -EBUSY: @@ -3536,19 +3377,19 @@ + /* We stick the remaining skbs from local_queue at the top of the peer's + * queue again, setting the top of local_queue to be the skb that begins + * the requeueing. */ -+ spin_lock_irqsave(&peer->tx_packet_queue.lock, flags); ++ spin_lock_bh(&peer->tx_packet_queue.lock); + skb_queue_splice(&queue, &peer->tx_packet_queue); -+ spin_unlock_irqrestore(&peer->tx_packet_queue.lock, flags); ++ spin_unlock_bh(&peer->tx_packet_queue.lock); + break; + case -ENOKEY: + /* ENOKEY means that we don't have a valid session for the peer, which + * means we should initiate a session, but after requeuing like above. */ + -+ spin_lock_irqsave(&peer->tx_packet_queue.lock, flags); ++ spin_lock_bh(&peer->tx_packet_queue.lock); + skb_queue_splice(&queue, &peer->tx_packet_queue); -+ spin_unlock_irqrestore(&peer->tx_packet_queue.lock, flags); ++ spin_unlock_bh(&peer->tx_packet_queue.lock); + -+ packet_queue_handshake_initiation(peer); ++ packet_queue_handshake_initiation(peer, false); + break; + default: + /* If we failed for any other reason, we want to just free the packets and @@ -3557,9 +3398,9 @@ + __skb_queue_purge(&queue); + } +} ---- /dev/null -+++ b/net/wireguard/socket.c -@@ -0,0 +1,385 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/socket.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,387 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. 
*/ + +#include "device.h" @@ -3572,6 +3413,7 @@ +#include +#include +#include ++#include +#include +#include + @@ -3592,8 +3434,8 @@ + skb->next = skb->prev = NULL; + skb->dev = netdev_pub(wg); + -+ rcu_read_lock(); -+ sock = rcu_dereference(wg->sock4); ++ rcu_read_lock_bh(); ++ sock = rcu_dereference_bh(wg->sock4); + + if (unlikely(!sock)) { + ret = -ENONET; @@ -3606,20 +3448,22 @@ + if (!rt) { + security_sk_classify_flow(sock, flowi4_to_flowi(&fl)); + rt = ip_route_output_flow(sock_net(sock), &fl, sock); -+ if (unlikely(IS_ERR(rt) && PTR_ERR(rt) == -EINVAL && fl.saddr)) { ++ if (unlikely(endpoint->src4.s_addr && ((IS_ERR(rt) && PTR_ERR(rt) == -EINVAL) || (!IS_ERR(rt) && !inet_confirm_addr(sock_net(sock), rcu_dereference_bh(rt->dst.dev->ip_ptr), 0, fl.saddr, RT_SCOPE_HOST))))) { + endpoint->src4.s_addr = fl.saddr = 0; + if (cache) + dst_cache_reset(cache); ++ if (!IS_ERR(rt)) ++ ip_rt_put(rt); + rt = ip_route_output_flow(sock_net(sock), &fl, sock); + } + if (unlikely(IS_ERR(rt))) { + ret = PTR_ERR(rt); -+ net_dbg_ratelimited("No route to %pISpfsc, error %d\n", &endpoint->addr, ret); ++ net_dbg_ratelimited("%s: No route to %pISpfsc, error %d\n", netdev_pub(wg)->name, &endpoint->addr, ret); + goto err; + } else if (unlikely(rt->dst.dev == skb->dev)) { -+ dst_release(&rt->dst); ++ ip_rt_put(rt); + ret = -ELOOP; -+ net_dbg_ratelimited("Avoiding routing loop to %pISpfsc\n", &endpoint->addr); ++ net_dbg_ratelimited("%s: Avoiding routing loop to %pISpfsc\n", netdev_pub(wg)->name, &endpoint->addr); + goto err; + } + if (cache) @@ -3635,7 +3479,7 @@ +err: + kfree_skb(skb); +out: -+ rcu_read_unlock(); ++ rcu_read_unlock_bh(); + return ret; +} + @@ -3659,8 +3503,8 @@ + skb->next = skb->prev = NULL; + skb->dev = netdev_pub(wg); + -+ rcu_read_lock(); -+ sock = rcu_dereference(wg->sock6); ++ rcu_read_lock_bh(); ++ sock = rcu_dereference_bh(wg->sock6); + + if (unlikely(!sock)) { + ret = -ENONET; @@ -3679,12 +3523,12 @@ + } + ret = ipv6_stub->ipv6_dst_lookup(sock_net(sock), sock, &dst, &fl); + if (unlikely(ret)) { -+ net_dbg_ratelimited("No route to %pISpfsc, error %d\n", &endpoint->addr, ret); ++ net_dbg_ratelimited("%s: No route to %pISpfsc, error %d\n", netdev_pub(wg)->name, &endpoint->addr, ret); + goto err; + } else if (unlikely(dst->dev == skb->dev)) { + dst_release(dst); + ret = -ELOOP; -+ net_dbg_ratelimited("Avoiding routing loop to %pISpfsc\n", &endpoint->addr); ++ net_dbg_ratelimited("%s: Avoiding routing loop to %pISpfsc\n", netdev_pub(wg)->name, &endpoint->addr); + goto err; + } + if (cache) @@ -3701,7 +3545,7 @@ +err: + kfree_skb(skb); +out: -+ rcu_read_unlock(); ++ rcu_read_unlock_bh(); + return ret; +#else + return -EAFNOSUPPORT; @@ -3766,12 +3610,12 @@ +int socket_endpoint_from_skb(struct endpoint *endpoint, struct sk_buff *skb) +{ + memset(endpoint, 0, sizeof(struct endpoint)); -+ if (ip_hdr(skb)->version == 4) { ++ if (skb->protocol == htons(ETH_P_IP)) { + endpoint->addr4.sin_family = AF_INET; + endpoint->addr4.sin_port = udp_hdr(skb)->source; + endpoint->addr4.sin_addr.s_addr = ip_hdr(skb)->saddr; + endpoint->src4.s_addr = ip_hdr(skb)->daddr; -+ } else if (ip_hdr(skb)->version == 6) { ++ } else if (skb->protocol == htons(ETH_P_IPV6)) { + endpoint->addr6.sin6_family = AF_INET6; + endpoint->addr6.sin6_port = udp_hdr(skb)->source; + endpoint->addr6.sin6_addr = ipv6_hdr(skb)->saddr; @@ -3884,11 +3728,11 @@ + .use_udp6_rx_checksums = true, + .ipv6_v6only = true + }; ++#endif ++ mutex_lock(&wg->socket_update_lock); ++#if IS_ENABLED(CONFIG_IPV6) +retry: +#endif -+ -+ 
mutex_lock(&wg->socket_update_lock); -+ + if (rcu_dereference_protected(wg->sock4, lockdep_is_held(&wg->socket_update_lock)) || + rcu_dereference_protected(wg->sock6, lockdep_is_held(&wg->socket_update_lock))) { + ret = -EADDRINUSE; @@ -3897,11 +3741,10 @@ + + ret = udp_sock_create(wg->creating_net, &port4, &new4); + if (ret < 0) { -+ pr_err("Could not create IPv4 socket\n"); ++ pr_err("%s: Could not create IPv4 socket\n", netdev_pub(wg)->name); + goto out; + } + wg->incoming_port = ntohs(inet_sk(new4->sk)->inet_sport); -+ + set_sock_opts(new4); + setup_udp_tunnel_sock(wg->creating_net, new4, &cfg); + rcu_assign_pointer(wg->sock4, new4->sk); @@ -3918,7 +3761,7 @@ + goto retry; + if (!port4.local_udp_port) + wg->incoming_port = 0; -+ pr_err("Could not create IPv6 socket\n"); ++ pr_err("%s: Could not create IPv6 socket\n", netdev_pub(wg)->name); + goto out; + } + set_sock_opts(new6); @@ -3940,14 +3783,14 @@ + rcu_assign_pointer(wg->sock4, NULL); + rcu_assign_pointer(wg->sock6, NULL); + mutex_unlock(&wg->socket_update_lock); -+ synchronize_rcu(); ++ synchronize_rcu_bh(); + synchronize_net(); + sock_free(old4); + sock_free(old6); +} ---- /dev/null -+++ b/net/wireguard/timers.c -@@ -0,0 +1,178 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/timers.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,179 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ + +#include "timers.h" @@ -3956,7 +3799,7 @@ +#include "packets.h" + +/* -+ * Timer for retransmitting the handshake if we don't hear back after `REKEY_TIMEOUT` ms ++ * Timer for retransmitting the handshake if we don't hear back after `REKEY_TIMEOUT + jitter` ms + * Timer for sending empty packet if we have received a packet but after have not sent one for `KEEPALIVE_TIMEOUT` ms + * Timer for initiating new handshake if we have sent a packet but after have not received one (even empty) for `(KEEPALIVE_TIMEOUT + REKEY_TIMEOUT)` ms + * Timer for zeroing out all ephemeral keys after `(REJECT_AFTER_TIME * 3)` ms if no new keys have been received @@ -3977,25 +3820,26 @@ +static void expired_retransmit_handshake(unsigned long ptr) +{ + peer_get_from_ptr(ptr); -+ pr_debug("Handshake for peer %Lu (%pISpfsc) did not complete after %d seconds, retrying\n", peer->internal_id, &peer->endpoint.addr, REKEY_TIMEOUT / HZ); + if (peer->timer_handshake_attempts > MAX_TIMER_HANDSHAKES) { ++ pr_debug("%s: Handshake for peer %Lu (%pISpfsc) did not complete after %d attempts, giving up\n", netdev_pub(peer->device)->name, peer->internal_id, &peer->endpoint.addr, MAX_TIMER_HANDSHAKES + 2); ++ + del_timer(&peer->timer_send_keepalive); + /* We remove all existing packets and don't try again, + * if we try unsuccessfully for too long to make a handshake. */ + skb_queue_purge(&peer->tx_packet_queue); + /* We set a timer for destroying any residue that might be left + * of a partial exchange. */ -+ if (likely(peer->timers_enabled)) ++ if (likely(peer->timers_enabled) && !timer_pending(&peer->timer_kill_ephemerals)) + mod_timer(&peer->timer_kill_ephemerals, jiffies + (REJECT_AFTER_TIME * 3)); -+ goto out; ++ } else { ++ ++peer->timer_handshake_attempts; ++ pr_debug("%s: Handshake for peer %Lu (%pISpfsc) did not complete after %d seconds, retrying (try %d)\n", netdev_pub(peer->device)->name, peer->internal_id, &peer->endpoint.addr, REKEY_TIMEOUT / HZ, peer->timer_handshake_attempts + 1); ++ ++ /* We clear the endpoint address src address, in case this is the cause of trouble. 
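+ * A cached source address can go stale when the host's own addresses change
+ * (e.g. after roaming to a different network), so clearing it lets the next
+ * transmit pick a fresh source through an ordinary route lookup.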
*/ ++ socket_clear_peer_endpoint_src(peer); ++ ++ packet_queue_handshake_initiation(peer, true); + } -+ -+ /* We clear the endpoint address src address, in case this is the cause of trouble. */ -+ socket_clear_peer_endpoint_src(peer); -+ -+ packet_queue_handshake_initiation(peer); -+ ++peer->timer_handshake_attempts; -+out: + peer_put(peer); +} + @@ -4014,23 +3858,23 @@ +static void expired_new_handshake(unsigned long ptr) +{ + peer_get_from_ptr(ptr); -+ pr_debug("Retrying handshake with peer %Lu (%pISpfsc) because we stopped hearing back after %d seconds\n", peer->internal_id, &peer->endpoint.addr, (KEEPALIVE_TIMEOUT + REKEY_TIMEOUT) / HZ); ++ pr_debug("%s: Retrying handshake with peer %Lu (%pISpfsc) because we stopped hearing back after %d seconds\n", netdev_pub(peer->device)->name, peer->internal_id, &peer->endpoint.addr, (KEEPALIVE_TIMEOUT + REKEY_TIMEOUT) / HZ); + /* We clear the endpoint address src address, in case this is the cause of trouble. */ + socket_clear_peer_endpoint_src(peer); -+ packet_queue_handshake_initiation(peer); ++ packet_queue_handshake_initiation(peer, false); + peer_put(peer); +} + +static void expired_kill_ephemerals(unsigned long ptr) +{ + peer_get_from_ptr(ptr); -+ if (!queue_work(peer->device->workqueue, &peer->clear_peer_work)) /* Takes our reference. */ ++ if (!queue_work(peer->device->peer_wq, &peer->clear_peer_work)) /* Takes our reference. */ + peer_put(peer); /* If the work was already on the queue, we want to drop the extra reference */ +} +static void queued_expired_kill_ephemerals(struct work_struct *work) +{ + struct wireguard_peer *peer = container_of(work, struct wireguard_peer, clear_peer_work); -+ pr_debug("Zeroing out all keys for peer %Lu (%pISpfsc), since we haven't received a new one in %d seconds\n", peer->internal_id, &peer->endpoint.addr, (REJECT_AFTER_TIME * 3) / HZ); ++ pr_debug("%s: Zeroing out all keys for peer %Lu (%pISpfsc), since we haven't received a new one in %d seconds\n", netdev_pub(peer->device)->name, peer->internal_id, &peer->endpoint.addr, (REJECT_AFTER_TIME * 3) / HZ); + noise_handshake_clear(&peer->handshake); + noise_keypairs_clear(&peer->keypairs); + peer_put(peer); @@ -4073,18 +3917,19 @@ +/* Should be called after a handshake initiation message is sent. */ +void timers_handshake_initiated(struct wireguard_peer *peer) +{ -+ if (likely(peer->timers_enabled)) ++ if (likely(peer->timers_enabled)) { + del_timer(&peer->timer_send_keepalive); -+ if (likely(peer->timers_enabled)) + mod_timer(&peer->timer_retransmit_handshake, slack_time(jiffies + REKEY_TIMEOUT + prandom_u32_max(REKEY_TIMEOUT_JITTER_MAX))); ++ } +} + -+/* Should be called after a handshake response message is received and processed. */ ++/* Should be called after a handshake response message is received and processed or when getting key confirmation via the first data message. */ +void timers_handshake_complete(struct wireguard_peer *peer) +{ + if (likely(peer->timers_enabled)) + del_timer(&peer->timer_retransmit_handshake); + peer->timer_handshake_attempts = 0; ++ do_gettimeofday(&peer->walltime_last_handshake); +} + +/* Should be called after an ephemeral key is created, which is before sending a handshake response or after receiving a handshake response. 
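+ * Either way this arms a (REJECT_AFTER_TIME * 3) countdown, after which any
+ * residue of a partial exchange is zeroed out, mirroring the cleanup armed by
+ * expired_retransmit_handshake() when retries are exhausted.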
*/ @@ -4092,10 +3937,9 @@ +{ + if (likely(peer->timers_enabled)) + mod_timer(&peer->timer_kill_ephemerals, jiffies + (REJECT_AFTER_TIME * 3)); -+ do_gettimeofday(&peer->walltime_last_handshake); +} + -+/* Should be called before an packet with authentication -- data, keepalive, either handshake -- is sent, or after one is received. */ ++/* Should be called before a packet with authentication -- data, keepalive, either handshake -- is sent, or after one is received. */ +void timers_any_authenticated_packet_traversal(struct wireguard_peer *peer) +{ + if (peer->persistent_keepalive_interval && likely(peer->timers_enabled)) @@ -4126,8 +3970,8 @@ + del_timer_sync(&peer->timer_persistent_keepalive); + flush_work(&peer->clear_peer_work); +} ---- /dev/null -+++ b/net/wireguard/config.h +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/config.h 2017-07-06 18:17:33.000000000 +0200 @@ -0,0 +1,11 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ + @@ -4140,8 +3984,8 @@ +int config_set_device(struct wireguard_device *wg, void __user *udevice); + +#endif ---- /dev/null -+++ b/net/wireguard/cookie.h +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/cookie.h 2017-07-06 18:17:33.000000000 +0200 @@ -0,0 +1,51 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ + @@ -4149,7 +3993,6 @@ +#define WGCOOKIE_H + +#include "messages.h" -+#include "ratelimiter.h" +#include + +struct wireguard_peer; @@ -4159,9 +4002,9 @@ +struct cookie_checker { + u8 secret[NOISE_HASH_LEN]; + u8 cookie_encryption_key[NOISE_SYMMETRIC_KEY_LEN]; ++ u8 message_mac1_key[NOISE_SYMMETRIC_KEY_LEN]; + u64 secret_birthdate; + struct rw_semaphore secret_lock; -+ struct ratelimiter ratelimiter; + struct wireguard_device *device; +}; + @@ -4172,6 +4015,7 @@ + bool have_sent_mac1; + u8 last_mac1_sent[COOKIE_LEN]; + u8 cookie_decryption_key[NOISE_SYMMETRIC_KEY_LEN]; ++ u8 message_mac1_key[NOISE_SYMMETRIC_KEY_LEN]; + struct rw_semaphore lock; +}; + @@ -4182,21 +4026,21 @@ + VALID_MAC_WITH_COOKIE +}; + -+int cookie_checker_init(struct cookie_checker *checker, struct wireguard_device *wg); -+void cookie_checker_uninit(struct cookie_checker *checker); -+void cookie_checker_precompute_keys(struct cookie_checker *checker, struct wireguard_peer *peer); ++void cookie_checker_init(struct cookie_checker *checker, struct wireguard_device *wg); ++void cookie_checker_precompute_device_keys(struct cookie_checker *checker); ++void cookie_checker_precompute_peer_keys(struct wireguard_peer *peer); +void cookie_init(struct cookie *cookie); + -+enum cookie_mac_state cookie_validate_packet(struct cookie_checker *checker, struct sk_buff *skb, void *data_start, size_t data_len, bool check_cookie); ++enum cookie_mac_state cookie_validate_packet(struct cookie_checker *checker, struct sk_buff *skb, bool check_cookie); +void cookie_add_mac_to_packet(void *message, size_t len, struct wireguard_peer *peer); + -+void cookie_message_create(struct message_handshake_cookie *src, struct sk_buff *skb, void *data_start, size_t data_len, __le32 index, struct cookie_checker *checker); ++void cookie_message_create(struct message_handshake_cookie *src, struct sk_buff *skb, __le32 index, struct cookie_checker *checker); +void cookie_message_consume(struct message_handshake_cookie *src, struct wireguard_device *wg); + +#endif ---- /dev/null -+++ b/net/wireguard/device.h -@@ -0,0 +1,45 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/device.h 2017-07-06 18:17:33.000000000 
+0200 +@@ -0,0 +1,51 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ + +#ifndef WGDEVICE_H @@ -4213,19 +4057,24 @@ +#include +#include +#include -+#include ++ ++struct wireguard_device; ++struct handshake_worker { ++ struct wireguard_device *wg; ++ struct work_struct work; ++}; + +struct wireguard_device { ++ struct list_head device_list; + struct sock __rcu *sock4, *sock6; + u16 incoming_port; + u32 fwmark; + struct net *creating_net; -+ struct workqueue_struct *workqueue; -+ struct workqueue_struct *parallelqueue; -+ struct padata_instance *parallel_send, *parallel_receive; + struct noise_static_identity static_identity; ++ struct workqueue_struct *incoming_handshake_wq, *peer_wq; + struct sk_buff_head incoming_handshakes; -+ struct work_struct incoming_handshakes_work; ++ atomic_t incoming_handshake_seqnr; ++ struct handshake_worker __percpu *incoming_handshakes_worker; + struct cookie_checker cookie_checker; + struct pubkey_hashtable peer_hashtable; + struct index_hashtable index_hashtable; @@ -4233,8 +4082,9 @@ + struct list_head peer_list; + struct mutex device_update_lock; + struct mutex socket_update_lock; -+#ifdef CONFIG_PM_SLEEP -+ struct notifier_block clear_peers_on_suspend; ++#ifdef CONFIG_WIREGUARD_PARALLEL ++ struct workqueue_struct *crypt_wq; ++ struct padata_instance *encrypt_pd, *decrypt_pd; +#endif +}; + @@ -4242,9 +4092,9 @@ +void device_uninit(void); + +#endif ---- /dev/null -+++ b/net/wireguard/hashtables.h -@@ -0,0 +1,48 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/hashtables.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,47 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ + +#ifndef HASHTABLES_H @@ -4271,7 +4121,6 @@ + +struct index_hashtable { + DECLARE_HASHTABLE(hashtable, 10); -+ siphash_key_t key; + spinlock_t lock; +}; + @@ -4288,14 +4137,14 @@ +}; +void index_hashtable_init(struct index_hashtable *table); +__le32 index_hashtable_insert(struct index_hashtable *table, struct index_hashtable_entry *entry); -+void index_hashtable_replace(struct index_hashtable *table, struct index_hashtable_entry *old, struct index_hashtable_entry *new); ++bool index_hashtable_replace(struct index_hashtable *table, struct index_hashtable_entry *old, struct index_hashtable_entry *new); +void index_hashtable_remove(struct index_hashtable *table, struct index_hashtable_entry *entry); +struct index_hashtable_entry *index_hashtable_lookup(struct index_hashtable *table, const enum index_hashtable_type type_mask, const __le32 index); + +#endif ---- /dev/null -+++ b/net/wireguard/messages.h -@@ -0,0 +1,143 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/messages.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,150 @@ +/* + * Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. 
+ * @@ -4311,6 +4160,7 @@ + +#include +#include ++#include + +enum noise_lengths { + NOISE_PUBLIC_KEY_LEN = CURVE25519_POINT_SIZE, @@ -4347,7 +4197,6 @@ + KEEPALIVE_TIMEOUT = 10 * HZ, + MAX_TIMER_HANDSHAKES = (90 * HZ) / REKEY_TIMEOUT, + MAX_QUEUED_INCOMING_HANDSHAKES = 4096, -+ MAX_BURST_INCOMING_HANDSHAKES = 16, + MAX_QUEUED_OUTGOING_PACKETS = 1024 +}; + @@ -4422,26 +4271,33 @@ + HANDSHAKE_DSCP = 0b10001000 /* AF41, plus 00 ECN */ +}; + -+static inline enum message_type message_determine_type(void *src, size_t src_len) ++static const unsigned int message_header_sizes[MESSAGE_TOTAL] = { ++ [MESSAGE_HANDSHAKE_INITIATION] = sizeof(struct message_handshake_initiation), ++ [MESSAGE_HANDSHAKE_RESPONSE] = sizeof(struct message_handshake_response), ++ [MESSAGE_HANDSHAKE_COOKIE] = sizeof(struct message_handshake_cookie), ++ [MESSAGE_DATA] = sizeof(struct message_data) ++}; ++ ++static inline enum message_type message_determine_type(struct sk_buff *skb) +{ -+ struct message_header *header = src; -+ if (unlikely(src_len < sizeof(struct message_header))) ++ struct message_header *header = (struct message_header *)skb->data; ++ if (unlikely(skb->len < sizeof(struct message_header))) + return MESSAGE_INVALID; -+ if (header->type == cpu_to_le32(MESSAGE_DATA) && src_len >= MESSAGE_MINIMUM_LENGTH) ++ if (header->type == cpu_to_le32(MESSAGE_DATA) && skb->len >= MESSAGE_MINIMUM_LENGTH) + return MESSAGE_DATA; -+ if (header->type == cpu_to_le32(MESSAGE_HANDSHAKE_INITIATION) && src_len == sizeof(struct message_handshake_initiation)) ++ if (header->type == cpu_to_le32(MESSAGE_HANDSHAKE_INITIATION) && skb->len == sizeof(struct message_handshake_initiation)) + return MESSAGE_HANDSHAKE_INITIATION; -+ if (header->type == cpu_to_le32(MESSAGE_HANDSHAKE_RESPONSE) && src_len == sizeof(struct message_handshake_response)) ++ if (header->type == cpu_to_le32(MESSAGE_HANDSHAKE_RESPONSE) && skb->len == sizeof(struct message_handshake_response)) + return MESSAGE_HANDSHAKE_RESPONSE; -+ if (header->type == cpu_to_le32(MESSAGE_HANDSHAKE_COOKIE) && src_len == sizeof(struct message_handshake_cookie)) ++ if (header->type == cpu_to_le32(MESSAGE_HANDSHAKE_COOKIE) && skb->len == sizeof(struct message_handshake_cookie)) + return MESSAGE_HANDSHAKE_COOKIE; + return MESSAGE_INVALID; +} + +#endif ---- /dev/null -+++ b/net/wireguard/noise.h -@@ -0,0 +1,123 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/noise.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,122 @@ +/* + * Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. 
+ * @@ -4497,10 +4353,9 @@ +}; + +struct noise_static_identity { -+ bool has_identity, has_psk; ++ bool has_identity; + u8 static_public[NOISE_PUBLIC_KEY_LEN]; + u8 static_private[NOISE_PUBLIC_KEY_LEN]; -+ u8 preshared_key[NOISE_SYMMETRIC_KEY_LEN]; + struct rw_semaphore lock; +}; + @@ -4520,20 +4375,20 @@ + + struct noise_static_identity *static_identity; + -+ u8 ephemeral_public[NOISE_PUBLIC_KEY_LEN]; + u8 ephemeral_private[NOISE_PUBLIC_KEY_LEN]; -+ + u8 remote_static[NOISE_PUBLIC_KEY_LEN]; + u8 remote_ephemeral[NOISE_PUBLIC_KEY_LEN]; ++ u8 precomputed_static_static[NOISE_PUBLIC_KEY_LEN]; ++ ++ u8 preshared_key[NOISE_SYMMETRIC_KEY_LEN]; + -+ u8 key[NOISE_SYMMETRIC_KEY_LEN]; + u8 hash[NOISE_HASH_LEN]; + u8 chaining_key[NOISE_HASH_LEN]; + + u8 latest_timestamp[NOISE_TIMESTAMP_LEN]; + __le32 remote_index; + -+ /* Protects all members except the immutable (after noise_peer_init): remote_static, static_identity */ ++ /* Protects all members except the immutable (after noise_handshake_init): remote_static, precomputed_static_static, static_identity */ + struct rw_semaphore lock; +}; + @@ -4546,7 +4401,7 @@ +struct message_handshake_cookie; + +void noise_init(void); -+void noise_handshake_init(struct noise_handshake *handshake, struct noise_static_identity *static_identity, const u8 peer_public_key[NOISE_PUBLIC_KEY_LEN], struct wireguard_peer *peer); ++bool noise_handshake_init(struct noise_handshake *handshake, struct noise_static_identity *static_identity, const u8 peer_public_key[NOISE_PUBLIC_KEY_LEN], const u8 peer_preshared_key[NOISE_SYMMETRIC_KEY_LEN], struct wireguard_peer *peer); +void noise_handshake_clear(struct noise_handshake *handshake); +void noise_keypair_put(struct noise_keypair *keypair); +struct noise_keypair *noise_keypair_get(struct noise_keypair *keypair); @@ -4554,7 +4409,7 @@ +bool noise_received_with_keypair(struct noise_keypairs *keypairs, struct noise_keypair *received_keypair); + +void noise_set_static_identity_private_key(struct noise_static_identity *static_identity, const u8 private_key[NOISE_PUBLIC_KEY_LEN]); -+void noise_set_static_identity_preshared_key(struct noise_static_identity *static_identity, const u8 preshared_key[NOISE_SYMMETRIC_KEY_LEN]); ++bool noise_precompute_static_static(struct wireguard_peer *peer); + +bool noise_handshake_create_initiation(struct message_handshake_initiation *dst, struct noise_handshake *handshake); +struct wireguard_peer *noise_handshake_consume_initiation(struct message_handshake_initiation *src, struct wireguard_device *wg); @@ -4565,9 +4420,9 @@ +bool noise_handshake_begin_session(struct noise_handshake *handshake, struct noise_keypairs *keypairs, bool i_am_the_initiator); + +#endif ---- /dev/null -+++ b/net/wireguard/packets.h -@@ -0,0 +1,44 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/packets.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,63 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. 
*/ + +#ifndef PACKETS_H @@ -4579,28 +4434,47 @@ + +#include +#include ++#include ++#include ++#include + +struct wireguard_device; +struct wireguard_peer; +struct sk_buff; + ++struct packet_cb { ++ u64 nonce; ++ u8 ds; ++}; ++#define PACKET_CB(skb) ((struct packet_cb *)skb->cb) ++ +/* receive.c */ +void packet_receive(struct wireguard_device *wg, struct sk_buff *skb); +void packet_process_queued_handshake_packets(struct work_struct *work); ++void packet_consume_data_done(struct sk_buff *skb, struct wireguard_peer *peer, struct endpoint *endpoint, bool used_new_key); + +/* send.c */ +void packet_send_queue(struct wireguard_peer *peer); +void packet_send_keepalive(struct wireguard_peer *peer); -+void packet_queue_handshake_initiation(struct wireguard_peer *peer); ++void packet_queue_handshake_initiation(struct wireguard_peer *peer, bool is_retry); +void packet_send_queued_handshakes(struct work_struct *work); +void packet_send_handshake_response(struct wireguard_peer *peer); -+void packet_send_handshake_cookie(struct wireguard_device *wg, struct sk_buff *initiating_skb, void *data, size_t data_len, __le32 sender_index); ++void packet_send_handshake_cookie(struct wireguard_device *wg, struct sk_buff *initiating_skb, __le32 sender_index); ++void packet_create_data_done(struct sk_buff_head *queue, struct wireguard_peer *peer); + +/* data.c */ -+typedef void (*packet_create_data_callback_t)(struct sk_buff_head *, struct wireguard_peer *); -+typedef void (*packet_consume_data_callback_t)(struct sk_buff *skb, struct wireguard_peer *, struct endpoint *, bool used_new_key, int err); -+int packet_create_data(struct sk_buff_head *queue, struct wireguard_peer *peer, packet_create_data_callback_t callback); -+void packet_consume_data(struct sk_buff *skb, size_t offset, struct wireguard_device *wg, packet_consume_data_callback_t callback); ++int packet_create_data(struct sk_buff_head *queue, struct wireguard_peer *peer); ++void packet_consume_data(struct sk_buff *skb, struct wireguard_device *wg); ++ ++/* Returns either the correct skb->protocol value, or 0 if invalid. */ ++static inline __be16 skb_examine_untrusted_ip_hdr(struct sk_buff *skb) ++{ ++ if (skb_network_header(skb) >= skb->head && (skb_network_header(skb) + sizeof(struct iphdr)) <= skb_tail_pointer(skb) && ip_hdr(skb)->version == 4) ++ return htons(ETH_P_IP); ++ if (skb_network_header(skb) >= skb->head && (skb_network_header(skb) + sizeof(struct ipv6hdr)) <= skb_tail_pointer(skb) && ipv6_hdr(skb)->version == 6) ++ return htons(ETH_P_IPV6); ++ return 0; ++} + +#ifdef CONFIG_WIREGUARD_PARALLEL +int packet_init_data_caches(void); @@ -4612,9 +4486,9 @@ +#endif + +#endif ---- /dev/null -+++ b/net/wireguard/peer.h -@@ -0,0 +1,75 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/peer.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,100 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. 
*/ + +#ifndef PEER_H @@ -4673,7 +4547,7 @@ +#endif +}; + -+struct wireguard_peer *peer_create(struct wireguard_device *wg, const u8 public_key[NOISE_PUBLIC_KEY_LEN]); ++struct wireguard_peer *peer_create(struct wireguard_device *wg, const u8 public_key[NOISE_PUBLIC_KEY_LEN], const u8 preshared_key[NOISE_SYMMETRIC_KEY_LEN]); + +struct wireguard_peer *peer_get(struct wireguard_peer *peer); +struct wireguard_peer *peer_rcu_get(struct wireguard_peer *peer); @@ -4684,44 +4558,59 @@ + +struct wireguard_peer *peer_lookup_by_index(struct wireguard_device *wg, u32 index); + -+int peer_for_each_unlocked(struct wireguard_device *wg, int (*fn)(struct wireguard_peer *peer, void *ctx), void *data); -+int peer_for_each(struct wireguard_device *wg, int (*fn)(struct wireguard_peer *peer, void *ctx), void *data); -+ +unsigned int peer_total_count(struct wireguard_device *wg); + ++/* This is a macro iterator of essentially this: ++ * ++ * if (__should_lock) ++ * mutex_lock(&(__wg)->device_update_lock); ++ * else ++ * lockdep_assert_held(&(__wg)->device_update_lock) ++ * list_for_each_entry_safe (__peer, __temp, &(__wg)->peer_list, peer_list) { ++ * __peer = peer_rcu_get(__peer); ++ * if (!__peer) ++ * continue; ++ * ITERATOR_BODY ++ * peer_put(__peer); ++ * } ++ * if (__should_lock) ++ * mutex_unlock(&(__wg)->device_update_lock); ++ * ++ * While it's really ugly to look at, the code gcc produces from it is actually perfect. ++ */ ++#define pfe_label(n) __PASTE(__PASTE(pfe_label_, n ## _), __LINE__) ++#define peer_for_each(__wg, __peer, __temp, __should_lock) \ ++ if (1) { if (__should_lock) mutex_lock(&(__wg)->device_update_lock); else lockdep_assert_held(&(__wg)->device_update_lock); goto pfe_label(1); } else pfe_label(1): \ ++ if (1) goto pfe_label(2); else while (1) if (1) { if (__should_lock) mutex_unlock(&(__wg)->device_update_lock); break; } else pfe_label(2): \ ++ list_for_each_entry_safe (__peer, __temp, &(__wg)->peer_list, peer_list) \ ++ if (0) pfe_label(3): break; else \ ++ if (0); else for (__peer = peer_rcu_get(peer); __peer;) if (1) { goto pfe_label(4); pfe_label(5): break; } else while (1) if (1) goto pfe_label(5); else pfe_label(4): \ ++ if (1) { goto pfe_label(6); pfe_label(7):; } else while (1) if (1) goto pfe_label(3); else while (1) if (1) goto pfe_label(7); else pfe_label(6): \ ++ if (1) { goto pfe_label(8); pfe_label(9): peer_put(__peer); break; pfe_label(10): peer_put(__peer); } else while (1) if (1) goto pfe_label(9); else while (1) if (1) goto pfe_label(10); else pfe_label(8): ++ +#endif ---- /dev/null -+++ b/net/wireguard/ratelimiter.h -@@ -0,0 +1,26 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/ratelimiter.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,16 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. 
*/ + +#ifndef RATELIMITER_H +#define RATELIMITER_H + -+#include ++#include + -+struct wireguard_device; -+struct sk_buff; ++int ratelimiter_init(void); ++void ratelimiter_uninit(void); ++bool ratelimiter_allow(struct sk_buff *skb, struct net *net); + -+struct ratelimiter { -+ struct net *net; -+ struct xt_hashlimit_mtinfo1 v4_info; -+#if IS_ENABLED(CONFIG_IPV6) -+ struct xt_hashlimit_mtinfo1 v6_info; ++#ifdef DEBUG ++bool ratelimiter_selftest(void); +#endif -+}; -+ -+int ratelimiter_init(struct ratelimiter *ratelimiter, struct wireguard_device *wg); -+void ratelimiter_uninit(struct ratelimiter *ratelimiter); -+bool ratelimiter_allow(struct ratelimiter *ratelimiter, struct sk_buff *skb); -+ -+int ratelimiter_module_init(void); -+void ratelimiter_module_deinit(void); + +#endif ---- /dev/null -+++ b/net/wireguard/routingtable.h -@@ -0,0 +1,40 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/routingtable.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,36 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ + +#ifndef ROUTINGTABLE_H @@ -4744,16 +4633,12 @@ +void routing_table_free(struct routing_table *table); +int routing_table_insert_v4(struct routing_table *table, const struct in_addr *ip, u8 cidr, struct wireguard_peer *peer); +int routing_table_insert_v6(struct routing_table *table, const struct in6_addr *ip, u8 cidr, struct wireguard_peer *peer); -+int routing_table_remove_v4(struct routing_table *table, const struct in_addr *ip, u8 cidr); -+int routing_table_remove_v6(struct routing_table *table, const struct in6_addr *ip, u8 cidr); -+int routing_table_remove_by_peer(struct routing_table *table, struct wireguard_peer *peer); -+int routing_table_walk_ips(struct routing_table *table, void *ctx, int (*func)(void *ctx, struct wireguard_peer *peer, union nf_inet_addr ip, u8 cidr, int family)); ++void routing_table_remove_by_peer(struct routing_table *table, struct wireguard_peer *peer); ++size_t routing_table_count_nodes(struct routing_table *table); +int routing_table_walk_ips_by_peer(struct routing_table *table, void *ctx, struct wireguard_peer *peer, int (*func)(void *ctx, union nf_inet_addr ip, u8 cidr, int family)); +int routing_table_walk_ips_by_peer_sleepable(struct routing_table *table, void *ctx, struct wireguard_peer *peer, int (*func)(void *ctx, union nf_inet_addr ip, u8 cidr, int family)); + +/* These return a strong reference to a peer: */ -+struct wireguard_peer *routing_table_lookup_v4(struct routing_table *table, const struct in_addr *ip); -+struct wireguard_peer *routing_table_lookup_v6(struct routing_table *table, const struct in6_addr *ip); +struct wireguard_peer *routing_table_lookup_dst(struct routing_table *table, struct sk_buff *skb); +struct wireguard_peer *routing_table_lookup_src(struct routing_table *table, struct sk_buff *skb); + @@ -4762,8 +4647,8 @@ +#endif + +#endif ---- /dev/null -+++ b/net/wireguard/socket.h +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/socket.h 2017-07-06 18:17:33.000000000 +0200 @@ -0,0 +1,24 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ + @@ -4789,8 +4674,8 @@ +void socket_clear_peer_endpoint_src(struct wireguard_peer *peer); + +#endif ---- /dev/null -+++ b/net/wireguard/timers.h +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/timers.h 2017-07-06 18:17:33.000000000 +0200 @@ -0,0 +1,19 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. 
*/ + @@ -4811,9 +4696,9 @@ +void timers_any_authenticated_packet_traversal(struct wireguard_peer *peer); + +#endif ---- /dev/null -+++ b/net/wireguard/uapi.h -@@ -0,0 +1,159 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/uapi.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,166 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. + * + * Userspace API for WireGuard @@ -4868,9 +4753,9 @@ + * If `wgpeer->flags & WGPEER_REMOVE_ME` is true, the peer identified by `wgpeer->public_key` is removed. + * If `wgpeer->flags & WGPEER_REPLACE_IPMASKS` is true, removes all ipmasks before adding new ones. + * If `wgdevice->private_key` is filled with zeros, no action is taken on the private key. -+ * If `wgdevice->preshared_key` is filled with zeros, no action is taken on the pre-shared key. ++ * If `wgdevice->preshared_key` is filled with zeros, no action is taken on the preshared key. + * If `wgdevice->flags & WGDEVICE_REMOVE_PRIVATE_KEY` is true, the private key is removed. -+ * If `wgdevice->flags & WGDEVICE_REMOVE_PRESHARED_KEY` is true, the pre-shared key is removed. ++ * If `wgdevice->flags & WGDEVICE_REMOVE_PRESHARED_KEY` is true, the preshared key is removed. + * + * Returns 0 on success, or -errno if an error occurred. + */ @@ -4915,10 +4800,13 @@ + +enum { + WGPEER_REMOVE_ME = (1 << 0), -+ WGPEER_REPLACE_IPMASKS = (1 << 1) ++ WGPEER_REPLACE_IPMASKS = (1 << 1), ++ WGPEER_REMOVE_PRESHARED_KEY = (1 << 2) +}; ++ +struct wgpeer { + __u8 public_key[WG_KEY_LEN]; /* Get/Set */ ++ __u8 preshared_key[WG_KEY_LEN]; /* Get/Set */ + __u32 flags; /* Set */ + + union { @@ -4937,16 +4825,20 @@ +enum { + WGDEVICE_REPLACE_PEERS = (1 << 0), + WGDEVICE_REMOVE_PRIVATE_KEY = (1 << 1), -+ WGDEVICE_REMOVE_PRESHARED_KEY = (1 << 2), -+ WGDEVICE_REMOVE_FWMARK = (1 << 3) ++ WGDEVICE_REMOVE_FWMARK = (1 << 2) +}; ++ ++enum { ++ WG_API_VERSION_MAGIC = 0xbeef0002 ++}; ++ +struct wgdevice { ++ __u32 version_magic; /* Must be value of WG_API_VERSION_MAGIC */ + char interface[IFNAMSIZ]; /* Get */ + __u32 flags; /* Set */ + + __u8 public_key[WG_KEY_LEN]; /* Get */ + __u8 private_key[WG_KEY_LEN]; /* Get/Set */ -+ __u8 preshared_key[WG_KEY_LEN]; /* Get/Set */ + __u32 fwmark; /* Get/Set */ + __u16 port; /* Get/Set */ + @@ -4973,17 +4865,17 @@ + ++(__i), (__ipmask) = (struct wgipmask *)((uint8_t *)(__ipmask) + sizeof(struct wgipmask))) + +#endif ---- /dev/null -+++ b/net/wireguard/version.h +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/version.h 2017-07-06 18:17:33.000000000 +0200 @@ -0,0 +1 @@ -+#define WIREGUARD_VERSION "0.0.20170223" ---- /dev/null -+++ b/net/wireguard/selftest/blake2s.h ++#define WIREGUARD_VERSION "0.0.20170706" +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/selftest/blake2s.h 2017-07-06 18:17:33.000000000 +0200 @@ -0,0 +1,556 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. 
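+ *
+ * Provenance note, an assumption rather than something stated here: the
+ * arrays below look like the reference BLAKE2s known-answer tests, one
+ * digest per input length for the unkeyed and keyed variants, which
+ * blake2s_selftest() recomputes and compares byte for byte.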
*/ + +#ifdef DEBUG -+static const u8 blake2s_testvecs[][BLAKE2S_OUTBYTES] = { ++static const u8 blake2s_testvecs[][BLAKE2S_OUTBYTES] __initconst = { + { 0x69, 0x21, 0x7A, 0x30, 0x79, 0x90, 0x80, 0x94, 0xE1, 0x11, 0x21, 0xD0, 0x42, 0x35, 0x4A, 0x7C, 0x1F, 0x55, 0xB6, 0x48, 0x2C, 0xA1, 0xA5, 0x1E, 0x1B, 0x25, 0x0D, 0xFD, 0x1E, 0xD0, 0xEE, 0xF9 }, + { 0xE3, 0x4D, 0x74, 0xDB, 0xAF, 0x4F, 0xF4, 0xC6, 0xAB, 0xD8, 0x71, 0xCC, 0x22, 0x04, 0x51, 0xD2, 0xEA, 0x26, 0x48, 0x84, 0x6C, 0x77, 0x57, 0xFB, 0xAA, 0xC8, 0x2F, 0xE5, 0x1A, 0xD6, 0x4B, 0xEA }, + { 0xDD, 0xAD, 0x9A, 0xB1, 0x5D, 0xAC, 0x45, 0x49, 0xBA, 0x42, 0xF4, 0x9D, 0x26, 0x24, 0x96, 0xBE, 0xF6, 0xC0, 0xBA, 0xE1, 0xDD, 0x34, 0x2A, 0x88, 0x08, 0xF8, 0xEA, 0x26, 0x7C, 0x6E, 0x21, 0x0C }, @@ -5242,7 +5134,7 @@ + { 0xF0, 0x3F, 0x57, 0x89, 0xD3, 0x33, 0x6B, 0x80, 0xD0, 0x02, 0xD5, 0x9F, 0xDF, 0x91, 0x8B, 0xDB, 0x77, 0x5B, 0x00, 0x95, 0x6E, 0xD5, 0x52, 0x8E, 0x86, 0xAA, 0x99, 0x4A, 0xCB, 0x38, 0xFE, 0x2D } +}; + -+static const u8 blake2s_keyed_testvecs[][BLAKE2S_OUTBYTES] = { ++static const u8 blake2s_keyed_testvecs[][BLAKE2S_OUTBYTES] __initconst = { + { 0x48, 0xA8, 0x99, 0x7D, 0xA4, 0x07, 0x87, 0x6B, 0x3D, 0x79, 0xC0, 0xD9, 0x23, 0x25, 0xAD, 0x3B, 0x89, 0xCB, 0xB7, 0x54, 0xD8, 0x6A, 0xB7, 0x1A, 0xEE, 0x04, 0x7A, 0xD3, 0x45, 0xFD, 0x2C, 0x49 }, + { 0x40, 0xD1, 0x5F, 0xEE, 0x7C, 0x32, 0x88, 0x30, 0x16, 0x6A, 0xC3, 0xF9, 0x18, 0x65, 0x0F, 0x80, 0x7E, 0x7E, 0x01, 0xE1, 0x77, 0x25, 0x8C, 0xDC, 0x0A, 0x39, 0xB1, 0x1F, 0x59, 0x80, 0x66, 0xF1 }, + { 0x6B, 0xB7, 0x13, 0x00, 0x64, 0x4C, 0xD3, 0x99, 0x1B, 0x26, 0xCC, 0xD4, 0xD2, 0x74, 0xAC, 0xD1, 0xAD, 0xEA, 0xB8, 0xB1, 0xD7, 0x91, 0x45, 0x46, 0xC1, 0x19, 0x8B, 0xBE, 0x9F, 0xC9, 0xD8, 0x03 }, @@ -5501,7 +5393,7 @@ + { 0x3F, 0xB7, 0x35, 0x06, 0x1A, 0xBC, 0x51, 0x9D, 0xFE, 0x97, 0x9E, 0x54, 0xC1, 0xEE, 0x5B, 0xFA, 0xD0, 0xA9, 0xD8, 0x58, 0xB3, 0x31, 0x5B, 0xAD, 0x34, 0xBD, 0xE9, 0x99, 0xEF, 0xD7, 0x24, 0xDD } +}; + -+bool blake2s_selftest(void) ++bool __init blake2s_selftest(void) +{ + u8 key[BLAKE2S_KEYBYTES]; + u8 buf[ARRAY_SIZE(blake2s_testvecs)]; @@ -5536,8 +5428,8 @@ + return success; +} +#endif ---- /dev/null -+++ b/net/wireguard/selftest/chacha20poly1305.h +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/selftest/chacha20poly1305.h 2017-07-06 18:17:33.000000000 +0200 @@ -0,0 +1,89 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. 
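+ *
+ * Provenance note, an assumption: these AEAD vectors appear to be the
+ * ChaCha20-Poly1305 test data of RFC 7539 (the plaintext is the familiar
+ * "Internet-Drafts are draft documents..." passage), with the XChaCha20
+ * cases reusing the same key and plaintext under a 24-byte nonce.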
*/ + @@ -5547,7 +5439,7 @@ + u8 *key, *nonce, *assoc, *input, *result; + size_t alen, ilen; +}; -+static const struct chacha20poly1305_testvec chacha20poly1305_enc_vectors[] = { { ++static const struct chacha20poly1305_testvec chacha20poly1305_enc_vectors[] __initconst = { { + .key = "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a\xf3\x33\x88\x86\x04\xf6\xb5\xf0\x47\x39\x17\xc1\x40\x2b\x80\x09\x9d\xca\x5c\xbc\x20\x70\x75\xc0", + .nonce = "\x01\x02\x03\x04\x05\x06\x07\x08", + .assoc = "\xf3\x33\x88\x86\x00\x00\x00\x00\x00\x00\x4e\x91", @@ -5556,7 +5448,7 @@ + .ilen = 265, + .result = "\x64\xa0\x86\x15\x75\x86\x1a\xf4\x60\xf0\x62\xc7\x9b\xe6\x43\xbd\x5e\x80\x5c\xfd\x34\x5c\xf3\x89\xf1\x08\x67\x0a\xc7\x6c\x8c\xb2\x4c\x6c\xfc\x18\x75\x5d\x43\xee\xa0\x9e\xe9\x4e\x38\x2d\x26\xb0\xbd\xb7\xb7\x3c\x32\x1b\x01\x00\xd4\xf0\x3b\x7f\x35\x58\x94\xcf\x33\x2f\x83\x0e\x71\x0b\x97\xce\x98\xc8\xa8\x4a\xbd\x0b\x94\x81\x14\xad\x17\x6e\x00\x8d\x33\xbd\x60\xf9\x82\xb1\xff\x37\xc8\x55\x97\x97\xa0\x6e\xf4\xf0\xef\x61\xc1\x86\x32\x4e\x2b\x35\x06\x38\x36\x06\x90\x7b\x6a\x7c\x02\xb0\xf9\xf6\x15\x7b\x53\xc8\x67\xe4\xb9\x16\x6c\x76\x7b\x80\x4d\x46\xa5\x9b\x52\x16\xcd\xe7\xa4\xe9\x90\x40\xc5\xa4\x04\x33\x22\x5e\xe2\x82\xa1\xb0\xa0\x6c\x52\x3e\xaf\x45\x34\xd7\xf8\x3f\xa1\x15\x5b\x00\x47\x71\x8c\xbc\x54\x6a\x0d\x07\x2b\x04\xb3\x56\x4e\xea\x1b\x42\x22\x73\xf5\x48\x27\x1a\x0b\xb2\x31\x60\x53\xfa\x76\x99\x19\x55\xeb\xd6\x31\x59\x43\x4e\xce\xbb\x4e\x46\x6d\xae\x5a\x10\x73\xa6\x72\x76\x27\x09\x7a\x10\x49\xe6\x17\xd9\x1d\x36\x10\x94\xfa\x68\xf0\xff\x77\x98\x71\x30\x30\x5b\xea\xba\x2e\xda\x04\xdf\x99\x7b\x71\x4d\x6c\x6f\x2c\x29\xa6\xad\x5c\xb4\x02\x2b\x02\x70\x9b\xee\xad\x9d\x67\x89\x0c\xbb\x22\x39\x23\x36\xfe\xa1\x85\x1f\x38" +} }; -+static const struct chacha20poly1305_testvec chacha20poly1305_dec_vectors[] = { { ++static const struct chacha20poly1305_testvec chacha20poly1305_dec_vectors[] __initconst = { { + .key = "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a\xf3\x33\x88\x86\x04\xf6\xb5\xf0\x47\x39\x17\xc1\x40\x2b\x80\x09\x9d\xca\x5c\xbc\x20\x70\x75\xc0", + .nonce = "\x01\x02\x03\x04\x05\x06\x07\x08", + .assoc = "\xf3\x33\x88\x86\x00\x00\x00\x00\x00\x00\x4e\x91", @@ -5566,7 +5458,7 @@ + .result = "\x49\x6e\x74\x65\x72\x6e\x65\x74\x2d\x44\x72\x61\x66\x74\x73\x20\x61\x72\x65\x20\x64\x72\x61\x66\x74\x20\x64\x6f\x63\x75\x6d\x65\x6e\x74\x73\x20\x76\x61\x6c\x69\x64\x20\x66\x6f\x72\x20\x61\x20\x6d\x61\x78\x69\x6d\x75\x6d\x20\x6f\x66\x20\x73\x69\x78\x20\x6d\x6f\x6e\x74\x68\x73\x20\x61\x6e\x64\x20\x6d\x61\x79\x20\x62\x65\x20\x75\x70\x64\x61\x74\x65\x64\x2c\x20\x72\x65\x70\x6c\x61\x63\x65\x64\x2c\x20\x6f\x72\x20\x6f\x62\x73\x6f\x6c\x65\x74\x65\x64\x20\x62\x79\x20\x6f\x74\x68\x65\x72\x20\x64\x6f\x63\x75\x6d\x65\x6e\x74\x73\x20\x61\x74\x20\x61\x6e\x79\x20\x74\x69\x6d\x65\x2e\x20\x49\x74\x20\x69\x73\x20\x69\x6e\x61\x70\x70\x72\x6f\x70\x72\x69\x61\x74\x65\x20\x74\x6f\x20\x75\x73\x65\x20\x49\x6e\x74\x65\x72\x6e\x65\x74\x2d\x44\x72\x61\x66\x74\x73\x20\x61\x73\x20\x72\x65\x66\x65\x72\x65\x6e\x63\x65\x20\x6d\x61\x74\x65\x72\x69\x61\x6c\x20\x6f\x72\x20\x74\x6f\x20\x63\x69\x74\x65\x20\x74\x68\x65\x6d\x20\x6f\x74\x68\x65\x72\x20\x74\x68\x61\x6e\x20\x61\x73\x20\x2f\xe2\x80\x9c\x77\x6f\x72\x6b\x20\x69\x6e\x20\x70\x72\x6f\x67\x72\x65\x73\x73\x2e\x2f\xe2\x80\x9d" +} }; + -+static const struct chacha20poly1305_testvec xchacha20poly1305_enc_vectors[] = { { ++static const struct chacha20poly1305_testvec xchacha20poly1305_enc_vectors[] __initconst = { { + .key = 
"\x1c\x92\x40\xa5\xeb\x55\xd3\x8a\xf3\x33\x88\x86\x04\xf6\xb5\xf0\x47\x39\x17\xc1\x40\x2b\x80\x09\x9d\xca\x5c\xbc\x20\x70\x75\xc0", + .nonce = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17", + .assoc = "\xf3\x33\x88\x86\x00\x00\x00\x00\x00\x00\x4e\x91", @@ -5575,7 +5467,7 @@ + .ilen = 265, + .result = "\x1a\x6e\x3a\xd9\xfd\x41\x3f\x77\x54\x72\x0a\x70\x9a\xa0\x29\x92\x2e\xed\x93\xcf\x0f\x71\x88\x18\x7a\x9d\x2d\x24\xe0\xf5\xea\x3d\x55\x64\xd7\xad\x2a\x1a\x1f\x7e\x86\x6d\xb0\xce\x80\x41\x72\x86\x26\xee\x84\xd7\xef\x82\x9e\xe2\x60\x9d\x5a\xfc\xf0\xe4\x19\x85\xea\x09\xc6\xfb\xb3\xa9\x50\x09\xec\x5e\x11\x90\xa1\xc5\x4e\x49\xef\x50\xd8\x8f\xe0\x78\xd7\xfd\xb9\x3b\xc9\xf2\x91\xc8\x25\xc8\xa7\x63\x60\xce\x10\xcd\xc6\x7f\xf8\x16\xf8\xe1\x0a\xd9\xde\x79\x50\x33\xf2\x16\x0f\x17\xba\xb8\x5d\xd8\xdf\x4e\x51\xa8\x39\xd0\x85\xca\x46\x6a\x10\xa7\xa3\x88\xef\x79\xb9\xf8\x24\xf3\xe0\x71\x7b\x76\x28\x46\x3a\x3a\x1b\x91\xb6\xd4\x3e\x23\xe5\x44\x15\xbf\x60\x43\x9d\xa4\xbb\xd5\x5f\x89\xeb\xef\x8e\xfd\xdd\xb4\x0d\x46\xf0\x69\x23\x63\xae\x94\xf5\x5e\xa5\xad\x13\x1c\x41\x76\xe6\x90\xd6\x6d\xa2\x8f\x97\x4c\xa8\x0b\xcf\x8d\x43\x2b\x9c\x9b\xc5\x58\xa5\xb6\x95\x9a\xbf\x81\xc6\x54\xc9\x66\x0c\xe5\x4f\x6a\x53\xa1\xe5\x0c\xba\x31\xde\x34\x64\x73\x8a\x3b\xbd\x92\x01\xdb\x71\x69\xf3\x58\x99\xbc\xd1\xcb\x4a\x05\xe2\x58\x9c\x25\x17\xcd\xdc\x83\xb7\xff\xfb\x09\x61\xad\xbf\x13\x5b\x5e\xed\x46\x82\x6f\x22\xd8\x93\xa6\x85\x5b\x40\x39\x5c\xc5\x9c" +} }; -+static const struct chacha20poly1305_testvec xchacha20poly1305_dec_vectors[] = { { ++static const struct chacha20poly1305_testvec xchacha20poly1305_dec_vectors[] __initconst = { { + .key = "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a\xf3\x33\x88\x86\x04\xf6\xb5\xf0\x47\x39\x17\xc1\x40\x2b\x80\x09\x9d\xca\x5c\xbc\x20\x70\x75\xc0", + .nonce = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17", + .assoc = "\xf3\x33\x88\x86\x00\x00\x00\x00\x00\x00\x4e\x91", @@ -5585,15 +5477,15 @@ + .result = "\x49\x6e\x74\x65\x72\x6e\x65\x74\x2d\x44\x72\x61\x66\x74\x73\x20\x61\x72\x65\x20\x64\x72\x61\x66\x74\x20\x64\x6f\x63\x75\x6d\x65\x6e\x74\x73\x20\x76\x61\x6c\x69\x64\x20\x66\x6f\x72\x20\x61\x20\x6d\x61\x78\x69\x6d\x75\x6d\x20\x6f\x66\x20\x73\x69\x78\x20\x6d\x6f\x6e\x74\x68\x73\x20\x61\x6e\x64\x20\x6d\x61\x79\x20\x62\x65\x20\x75\x70\x64\x61\x74\x65\x64\x2c\x20\x72\x65\x70\x6c\x61\x63\x65\x64\x2c\x20\x6f\x72\x20\x6f\x62\x73\x6f\x6c\x65\x74\x65\x64\x20\x62\x79\x20\x6f\x74\x68\x65\x72\x20\x64\x6f\x63\x75\x6d\x65\x6e\x74\x73\x20\x61\x74\x20\x61\x6e\x79\x20\x74\x69\x6d\x65\x2e\x20\x49\x74\x20\x69\x73\x20\x69\x6e\x61\x70\x70\x72\x6f\x70\x72\x69\x61\x74\x65\x20\x74\x6f\x20\x75\x73\x65\x20\x49\x6e\x74\x65\x72\x6e\x65\x74\x2d\x44\x72\x61\x66\x74\x73\x20\x61\x73\x20\x72\x65\x66\x65\x72\x65\x6e\x63\x65\x20\x6d\x61\x74\x65\x72\x69\x61\x6c\x20\x6f\x72\x20\x74\x6f\x20\x63\x69\x74\x65\x20\x74\x68\x65\x6d\x20\x6f\x74\x68\x65\x72\x20\x74\x68\x61\x6e\x20\x61\x73\x20\x2f\xe2\x80\x9c\x77\x6f\x72\x6b\x20\x69\x6e\x20\x70\x72\x6f\x67\x72\x65\x73\x73\x2e\x2f\xe2\x80\x9d" +} }; + -+bool chacha20poly1305_selftest(void) ++bool __init chacha20poly1305_selftest(void) +{ + size_t i; + u8 computed_result[512]; -+ bool success = true; ++ bool success = true, ret; + + for (i = 0; i < ARRAY_SIZE(chacha20poly1305_enc_vectors); ++i) { + memset(computed_result, 0, sizeof(computed_result)); -+ success = chacha20poly1305_encrypt(computed_result, chacha20poly1305_enc_vectors[i].input, chacha20poly1305_enc_vectors[i].ilen, 
chacha20poly1305_enc_vectors[i].assoc, chacha20poly1305_enc_vectors[i].alen, le64_to_cpu(*(__force __le64 *)chacha20poly1305_enc_vectors[i].nonce), chacha20poly1305_enc_vectors[i].key); ++ chacha20poly1305_encrypt(computed_result, chacha20poly1305_enc_vectors[i].input, chacha20poly1305_enc_vectors[i].ilen, chacha20poly1305_enc_vectors[i].assoc, chacha20poly1305_enc_vectors[i].alen, le64_to_cpu(*(__force __le64 *)chacha20poly1305_enc_vectors[i].nonce), chacha20poly1305_enc_vectors[i].key); + if (memcmp(computed_result, chacha20poly1305_enc_vectors[i].result, chacha20poly1305_enc_vectors[i].ilen + POLY1305_MAC_SIZE)) { + pr_info("chacha20poly1305 encryption self-test %zu: FAIL\n", i + 1); + success = false; @@ -5601,15 +5493,15 @@ + } + for (i = 0; i < ARRAY_SIZE(chacha20poly1305_dec_vectors); ++i) { + memset(computed_result, 0, sizeof(computed_result)); -+ success = chacha20poly1305_decrypt(computed_result, chacha20poly1305_dec_vectors[i].input, chacha20poly1305_dec_vectors[i].ilen, chacha20poly1305_dec_vectors[i].assoc, chacha20poly1305_dec_vectors[i].alen, le64_to_cpu(*(__force __le64 *)chacha20poly1305_dec_vectors[i].nonce), chacha20poly1305_dec_vectors[i].key); -+ if (!success || memcmp(computed_result, chacha20poly1305_dec_vectors[i].result, chacha20poly1305_dec_vectors[i].ilen - POLY1305_MAC_SIZE)) { ++ ret = chacha20poly1305_decrypt(computed_result, chacha20poly1305_dec_vectors[i].input, chacha20poly1305_dec_vectors[i].ilen, chacha20poly1305_dec_vectors[i].assoc, chacha20poly1305_dec_vectors[i].alen, le64_to_cpu(*(__force __le64 *)chacha20poly1305_dec_vectors[i].nonce), chacha20poly1305_dec_vectors[i].key); ++ if (!ret || memcmp(computed_result, chacha20poly1305_dec_vectors[i].result, chacha20poly1305_dec_vectors[i].ilen - POLY1305_MAC_SIZE)) { + pr_info("chacha20poly1305 decryption self-test %zu: FAIL\n", i + 1); + success = false; + } + } + for (i = 0; i < ARRAY_SIZE(xchacha20poly1305_enc_vectors); ++i) { + memset(computed_result, 0, sizeof(computed_result)); -+ success = xchacha20poly1305_encrypt(computed_result, xchacha20poly1305_enc_vectors[i].input, xchacha20poly1305_enc_vectors[i].ilen, xchacha20poly1305_enc_vectors[i].assoc, xchacha20poly1305_enc_vectors[i].alen, xchacha20poly1305_enc_vectors[i].nonce, xchacha20poly1305_enc_vectors[i].key); ++ xchacha20poly1305_encrypt(computed_result, xchacha20poly1305_enc_vectors[i].input, xchacha20poly1305_enc_vectors[i].ilen, xchacha20poly1305_enc_vectors[i].assoc, xchacha20poly1305_enc_vectors[i].alen, xchacha20poly1305_enc_vectors[i].nonce, xchacha20poly1305_enc_vectors[i].key); + if (memcmp(computed_result, xchacha20poly1305_enc_vectors[i].result, xchacha20poly1305_enc_vectors[i].ilen + POLY1305_MAC_SIZE)) { + pr_info("xchacha20poly1305 encryption self-test %zu: FAIL\n", i + 1); + success = false; @@ -5617,8 +5509,8 @@ + } + for (i = 0; i < ARRAY_SIZE(xchacha20poly1305_dec_vectors); ++i) { + memset(computed_result, 0, sizeof(computed_result)); -+ success = xchacha20poly1305_decrypt(computed_result, xchacha20poly1305_dec_vectors[i].input, xchacha20poly1305_dec_vectors[i].ilen, xchacha20poly1305_dec_vectors[i].assoc, xchacha20poly1305_dec_vectors[i].alen, xchacha20poly1305_dec_vectors[i].nonce, xchacha20poly1305_dec_vectors[i].key); -+ if (!success || memcmp(computed_result, xchacha20poly1305_dec_vectors[i].result, xchacha20poly1305_dec_vectors[i].ilen - POLY1305_MAC_SIZE)) { ++ ret = xchacha20poly1305_decrypt(computed_result, xchacha20poly1305_dec_vectors[i].input, xchacha20poly1305_dec_vectors[i].ilen, 
xchacha20poly1305_dec_vectors[i].assoc, xchacha20poly1305_dec_vectors[i].alen, xchacha20poly1305_dec_vectors[i].nonce, xchacha20poly1305_dec_vectors[i].key); ++ if (!ret || memcmp(computed_result, xchacha20poly1305_dec_vectors[i].result, xchacha20poly1305_dec_vectors[i].ilen - POLY1305_MAC_SIZE)) { + pr_info("xchacha20poly1305 decryption self-test %zu: FAIL\n", i + 1); + success = false; + } @@ -5628,13 +5520,13 @@ + return success; +} +#endif ---- /dev/null -+++ b/net/wireguard/selftest/counter.h +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/selftest/counter.h 2017-07-06 18:17:33.000000000 +0200 @@ -0,0 +1,89 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ + +#ifdef DEBUG -+bool packet_counter_selftest(void) ++bool __init packet_counter_selftest(void) +{ + bool success = true; + unsigned int test_num = 0, i; @@ -5720,9 +5612,9 @@ + return success; +} +#endif ---- /dev/null -+++ b/net/wireguard/selftest/curve25519.h -@@ -0,0 +1,66 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/selftest/curve25519.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,74 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ + +#ifdef DEBUG @@ -5730,54 +5622,62 @@ + u8 private[CURVE25519_POINT_SIZE]; + u8 public[CURVE25519_POINT_SIZE]; + u8 result[CURVE25519_POINT_SIZE]; ++ bool valid; +}; -+static const struct curve25519_test_vector curve25519_test_vectors[] = { ++static const struct curve25519_test_vector curve25519_test_vectors[] __initconst = { + { + .private = { 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d, 0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2, 0x66, 0x45, 0xdf, 0x4c, 0x2f, 0x87, 0xeb, 0xc0, 0x99, 0x2a, 0xb1, 0x77, 0xfb, 0xa5, 0x1d, 0xb9, 0x2c, 0x2a }, + .public = { 0xde, 0x9e, 0xdb, 0x7d, 0x7b, 0x7d, 0xc1, 0xb4, 0xd3, 0x5b, 0x61, 0xc2, 0xec, 0xe4, 0x35, 0x37, 0x3f, 0x83, 0x43, 0xc8, 0x5b, 0x78, 0x67, 0x4d, 0xad, 0xfc, 0x7e, 0x14, 0x6f, 0x88, 0x2b, 0x4f }, -+ .result = { 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 } ++ .result = { 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 }, ++ .valid = true + }, + { + .private = { 0x5d, 0xab, 0x08, 0x7e, 0x62, 0x4a, 0x8a, 0x4b, 0x79, 0xe1, 0x7f, 0x8b, 0x83, 0x80, 0x0e, 0xe6, 0x6f, 0x3b, 0xb1, 0x29, 0x26, 0x18, 0xb6, 0xfd, 0x1c, 0x2f, 0x8b, 0x27, 0xff, 0x88, 0xe0, 0xeb }, + .public = { 0x85, 0x20, 0xf0, 0x09, 0x89, 0x30, 0xa7, 0x54, 0x74, 0x8b, 0x7d, 0xdc, 0xb4, 0x3e, 0xf7, 0x5a, 0x0d, 0xbf, 0x3a, 0x0d, 0x26, 0x38, 0x1a, 0xf4, 0xeb, 0xa4, 0xa9, 0x8e, 0xaa, 0x9b, 0x4e, 0x6a }, -+ .result = { 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 } ++ .result = { 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 }, ++ .valid = true + }, + { + .private = { 1 }, + .public = { 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, -+ .result = { 0x3c, 0x77, 0x77, 0xca, 0xf9, 
0x97, 0xb2, 0x64, 0x41, 0x60, 0x77, 0x66, 0x5b, 0x4e, 0x22, 0x9d, 0xb, 0x95, 0x48, 0xdc, 0xc, 0xd8, 0x19, 0x98, 0xdd, 0xcd, 0xc5, 0xc8, 0x53, 0x3c, 0x79, 0x7f } ++ .result = { 0x3c, 0x77, 0x77, 0xca, 0xf9, 0x97, 0xb2, 0x64, 0x41, 0x60, 0x77, 0x66, 0x5b, 0x4e, 0x22, 0x9d, 0xb, 0x95, 0x48, 0xdc, 0xc, 0xd8, 0x19, 0x98, 0xdd, 0xcd, 0xc5, 0xc8, 0x53, 0x3c, 0x79, 0x7f }, ++ .valid = true + }, + { + .private = { 1 }, + .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, -+ .result = { 0xb3, 0x2d, 0x13, 0x62, 0xc2, 0x48, 0xd6, 0x2f, 0xe6, 0x26, 0x19, 0xcf, 0xf0, 0x4d, 0xd4, 0x3d, 0xb7, 0x3f, 0xfc, 0x1b, 0x63, 0x8, 0xed, 0xe3, 0xb, 0x78, 0xd8, 0x73, 0x80, 0xf1, 0xe8, 0x34 } ++ .result = { 0xb3, 0x2d, 0x13, 0x62, 0xc2, 0x48, 0xd6, 0x2f, 0xe6, 0x26, 0x19, 0xcf, 0xf0, 0x4d, 0xd4, 0x3d, 0xb7, 0x3f, 0xfc, 0x1b, 0x63, 0x8, 0xed, 0xe3, 0xb, 0x78, 0xd8, 0x73, 0x80, 0xf1, 0xe8, 0x34 }, ++ .valid = true + }, + { + .private = { 0xa5, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d, 0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd, 0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18, 0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0xc4 }, + .public = { 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb, 0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c, 0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b, 0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c }, -+ .result = { 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90, 0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f, 0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7, 0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 } ++ .result = { 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90, 0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f, 0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7, 0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 }, ++ .valid = true + }, + { + .private = { 1, 2, 3, 4 }, + .public = { 0 }, -+ .result = { 0 } ++ .result = { 0 }, ++ .valid = false + }, + { + .private = { 2, 4, 6, 8 }, + .public = { 0xe0, 0xeb, 0x7a, 0x7c, 0x3b, 0x41, 0xb8, 0xae, 0x16, 0x56, 0xe3, 0xfa, 0xf1, 0x9f, 0xc4, 0x6a, 0xda, 0x09, 0x8d, 0xeb, 0x9c, 0x32, 0xb1, 0xfd, 0x86, 0x62, 0x05, 0x16, 0x5f, 0x49, 0xb8 }, -+ .result = { 0 } ++ .result = { 0 }, ++ .valid = false + } +}; -+bool curve25519_selftest(void) ++bool __init curve25519_selftest(void) +{ -+ bool success = true; ++ bool success = true, ret; + size_t i = 0; + u8 out[CURVE25519_POINT_SIZE]; + + for (i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) { + memset(out, 0, CURVE25519_POINT_SIZE); -+ curve25519(out, curve25519_test_vectors[i].private, curve25519_test_vectors[i].public); -+ if (memcmp(out, curve25519_test_vectors[i].result, CURVE25519_POINT_SIZE)) { ++ ret = curve25519(out, curve25519_test_vectors[i].private, curve25519_test_vectors[i].public); ++ if (ret != curve25519_test_vectors[i].valid || memcmp(out, curve25519_test_vectors[i].result, CURVE25519_POINT_SIZE)) { + pr_info("curve25519 self-test %zu: FAIL\n", i + 1); + success = false; + break; @@ -5789,13 +5689,446 @@ + return success; +} +#endif ---- /dev/null -+++ b/net/wireguard/selftest/routing-table.h -@@ -0,0 +1,133 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/selftest/ratelimiter.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,113 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. 
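++ *
++ * The expected_results table below encodes the token-bucket behaviour
++ * under test: the first PACKETS_BURSTABLE packets from a single source
++ * are admitted at once, the next is refused, and from then on one
++ * further packet is admitted per 1/PACKETS_PER_SECOND of elapsed time
++ * (sleeping two such intervals earns two packets).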
*/ + +#ifdef DEBUG -+static inline struct in_addr *ip4(u8 a, u8 b, u8 c, u8 d) ++ ++static const struct { bool result; unsigned int msec_to_sleep_before; } expected_results[] __initconst = { ++ [0 ... PACKETS_BURSTABLE - 1] = { true, 0 }, ++ [PACKETS_BURSTABLE] = { false, 0 }, ++ [PACKETS_BURSTABLE + 1] = { true, MSEC_PER_SEC / PACKETS_PER_SECOND }, ++ [PACKETS_BURSTABLE + 2] = { false, 0 }, ++ [PACKETS_BURSTABLE + 3] = { true, (MSEC_PER_SEC / PACKETS_PER_SECOND) * 2 }, ++ [PACKETS_BURSTABLE + 4] = { true, 0 }, ++ [PACKETS_BURSTABLE + 5] = { false, 0 } ++}; ++ ++bool __init ratelimiter_selftest(void) ++{ ++ struct sk_buff *skb4; ++ struct iphdr *hdr4; ++#if IS_ENABLED(CONFIG_IPV6) ++ struct sk_buff *skb6; ++ struct ipv6hdr *hdr6; ++#endif ++ int i = -1, ret = false; ++ ++ BUILD_BUG_ON(MSEC_PER_SEC % PACKETS_PER_SECOND != 0); ++ ++ if (ratelimiter_init()) ++ goto out; ++ if (ratelimiter_init()) { ++ ratelimiter_uninit(); ++ goto out; ++ } ++ if (ratelimiter_init()) { ++ ratelimiter_uninit(); ++ ratelimiter_uninit(); ++ goto out; ++ } ++ ++ skb4 = alloc_skb(sizeof(struct iphdr), GFP_KERNEL); ++ if (!skb4) ++ goto err_nofree; ++ skb4->protocol = htons(ETH_P_IP); ++ hdr4 = (struct iphdr *)skb_put(skb4, sizeof(struct iphdr)); ++ hdr4->saddr = htonl(8182); ++ skb_reset_network_header(skb4); ++ ++#if IS_ENABLED(CONFIG_IPV6) ++ skb6 = alloc_skb(sizeof(struct ipv6hdr), GFP_KERNEL); ++ if (!skb6) { ++ kfree_skb(skb4); ++ goto err_nofree; ++ } ++ skb6->protocol = htons(ETH_P_IPV6); ++ hdr6 = (struct ipv6hdr *)skb_put(skb6, sizeof(struct ipv6hdr)); ++ hdr6->saddr.in6_u.u6_addr32[0] = htonl(1212); ++ hdr6->saddr.in6_u.u6_addr32[1] = htonl(289188); ++ skb_reset_network_header(skb6); ++#endif ++ ++ for (i = 0; i < ARRAY_SIZE(expected_results); ++i) { ++ if (expected_results[i].msec_to_sleep_before) ++ msleep(expected_results[i].msec_to_sleep_before); ++ ++ if (ratelimiter_allow(skb4, &init_net) != expected_results[i].result) ++ goto err; ++ hdr4->saddr = htonl(ntohl(hdr4->saddr) + i + 1); ++ if (!ratelimiter_allow(skb4, &init_net)) ++ goto err; ++ hdr4->saddr = htonl(ntohl(hdr4->saddr) - i - 1); ++ ++#if IS_ENABLED(CONFIG_IPV6) ++ hdr6->saddr.in6_u.u6_addr32[2] = hdr6->saddr.in6_u.u6_addr32[3] = htonl(i); ++ if (ratelimiter_allow(skb6, &init_net) != expected_results[i].result) ++ goto err; ++ hdr6->saddr.in6_u.u6_addr32[0] = htonl(ntohl(hdr6->saddr.in6_u.u6_addr32[0]) + i + 1); ++ if (!ratelimiter_allow(skb6, &init_net)) ++ goto err; ++ hdr6->saddr.in6_u.u6_addr32[0] = htonl(ntohl(hdr6->saddr.in6_u.u6_addr32[0]) - i - 1); ++#endif ++ } ++ ++ gc_entries(NULL); ++ rcu_barrier_bh(); ++ ++ if (atomic_read(&total_entries)) ++ goto err; ++ ++ for (i = 0; i <= max_entries; ++i) { ++ hdr4->saddr = htonl(i); ++ if (ratelimiter_allow(skb4, &init_net) != (i != max_entries)) ++ goto err; ++ } ++ ++ ret = true; ++ ++err: ++ kfree_skb(skb4); ++#if IS_ENABLED(CONFIG_IPV6) ++ kfree_skb(skb6); ++#endif ++err_nofree: ++ ratelimiter_uninit(); ++ ratelimiter_uninit(); ++ ratelimiter_uninit(); ++out: ++ if (ret) ++ pr_info("ratelimiter self-tests: pass\n"); ++ else ++ pr_info("ratelimiter self-test %d: fail\n", i); ++ ++ return ret; ++} ++#endif +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/selftest/routingtable.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,504 @@ ++/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. 
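++ *
++ * Orientation note: this file differentially tests the routing trie. The
++ * deliberately naive "horrible" reference implementation keeps every
++ * prefix in a list ordered by decreasing CIDR length and answers lookups
++ * by linear scan; randomized_test() inserts identical routes into both
++ * structures and requires trie and list lookups to agree on thousands of
++ * random v4 and v6 addresses.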
*/ ++ ++#ifdef DEBUG ++ ++#ifdef DEBUG_PRINT_TRIE_GRAPHVIZ ++#include ++static __init void print_node(struct routing_table_node *node, u8 bits) ++{ ++ u32 color = 0; ++ char *style = "dotted"; ++ char *fmt_connection = KERN_DEBUG "\t\"%p/%d\" -> \"%p/%d\";\n"; ++ char *fmt_declaration = KERN_DEBUG "\t\"%p/%d\"[style=%s, color=\"#%06x\"];\n"; ++ if (bits == 32) { ++ fmt_connection = KERN_DEBUG "\t\"%pI4/%d\" -> \"%pI4/%d\";\n"; ++ fmt_declaration = KERN_DEBUG "\t\"%pI4/%d\"[style=%s, color=\"#%06x\"];\n"; ++ } else if (bits == 128) { ++ fmt_connection = KERN_DEBUG "\t\"%pI6/%d\" -> \"%pI6/%d\";\n"; ++ fmt_declaration = KERN_DEBUG "\t\"%pI6/%d\"[style=%s, color=\"#%06x\"];\n"; ++ } ++ if (node->peer) { ++ hsiphash_key_t key = { 0 }; ++ memcpy(&key, &node->peer, sizeof(node->peer)); ++ color = hsiphash_1u32(0xdeadbeef, &key) % 200 << 16 | hsiphash_1u32(0xbabecafe, &key) % 200 << 8 | hsiphash_1u32(0xabad1dea, &key) % 200; ++ style = "bold"; ++ } ++ printk(fmt_declaration, node->bits, node->cidr, style, color); ++ if (node->bit[0]) { ++ printk(fmt_connection, node->bits, node->cidr, node->bit[0]->bits, node->bit[0]->cidr); ++ print_node(node->bit[0], bits); ++ } ++ if (node->bit[1]) { ++ printk(fmt_connection, node->bits, node->cidr, node->bit[1]->bits, node->bit[1]->cidr); ++ print_node(node->bit[1], bits); ++ } ++} ++static __init void print_tree(struct routing_table_node *top, u8 bits) ++{ ++ printk(KERN_DEBUG "digraph trie {\n"); ++ print_node(top, bits); ++ printk(KERN_DEBUG "}\n"); ++} ++#endif ++ ++#ifdef DEBUG_RANDOM_TRIE ++#define NUM_PEERS 2000 ++#define NUM_RAND_ROUTES 400 ++#define NUM_MUTATED_ROUTES 100 ++#define NUM_QUERIES (NUM_RAND_ROUTES * NUM_MUTATED_ROUTES * 30) ++#include ++struct horrible_routing_table { ++ struct hlist_head head; ++}; ++struct horrible_routing_table_node { ++ struct hlist_node table; ++ union nf_inet_addr ip; ++ union nf_inet_addr mask; ++ uint8_t ip_version; ++ void *value; ++}; ++static __init void horrible_routing_table_init(struct horrible_routing_table *table) ++{ ++ INIT_HLIST_HEAD(&table->head); ++} ++static __init void horrible_routing_table_free(struct horrible_routing_table *table) ++{ ++ struct hlist_node *h; ++ struct horrible_routing_table_node *node; ++ hlist_for_each_entry_safe (node, h, &table->head, table) { ++ hlist_del(&node->table); ++ kfree(node); ++ }; ++} ++static __init inline union nf_inet_addr horrible_cidr_to_mask(uint8_t cidr) ++{ ++ union nf_inet_addr mask; ++ memset(&mask, 0x00, 128 / 8); ++ memset(&mask, 0xff, cidr / 8); ++ if (cidr % 32) ++ mask.all[cidr / 32] = htonl((0xFFFFFFFFUL << (32 - (cidr % 32))) & 0xFFFFFFFFUL); ++ return mask; ++} ++static __init inline uint8_t horrible_mask_to_cidr(union nf_inet_addr subnet) ++{ ++ return hweight32(subnet.all[0]) ++ + hweight32(subnet.all[1]) ++ + hweight32(subnet.all[2]) ++ + hweight32(subnet.all[3]); ++} ++static __init inline void horrible_mask_self(struct horrible_routing_table_node *node) ++{ ++ if (node->ip_version == 4) ++ node->ip.ip &= node->mask.ip; ++ else if (node->ip_version == 6) { ++ node->ip.ip6[0] &= node->mask.ip6[0]; ++ node->ip.ip6[1] &= node->mask.ip6[1]; ++ node->ip.ip6[2] &= node->mask.ip6[2]; ++ node->ip.ip6[3] &= node->mask.ip6[3]; ++ } ++} ++static __init inline bool horrible_match_v4(const struct horrible_routing_table_node *node, struct in_addr *ip) ++{ ++ return (ip->s_addr & node->mask.ip) == node->ip.ip; ++} ++static __init inline bool horrible_match_v6(const struct horrible_routing_table_node *node, struct in6_addr *ip) ++{ ++ return 
(ip->in6_u.u6_addr32[0] & node->mask.ip6[0]) == node->ip.ip6[0] && ++ (ip->in6_u.u6_addr32[1] & node->mask.ip6[1]) == node->ip.ip6[1] && ++ (ip->in6_u.u6_addr32[2] & node->mask.ip6[2]) == node->ip.ip6[2] && ++ (ip->in6_u.u6_addr32[3] & node->mask.ip6[3]) == node->ip.ip6[3]; ++} ++static __init void horrible_insert_ordered(struct horrible_routing_table *table, struct horrible_routing_table_node *node) ++{ ++ struct horrible_routing_table_node *other = NULL, *where = NULL; ++ uint8_t my_cidr = horrible_mask_to_cidr(node->mask); ++ hlist_for_each_entry (other, &table->head, table) { ++ if (!memcmp(&other->mask, &node->mask, sizeof(union nf_inet_addr)) && ++ !memcmp(&other->ip, &node->ip, sizeof(union nf_inet_addr)) && ++ other->ip_version == node->ip_version) { ++ other->value = node->value; ++ kfree(node); ++ return; ++ } ++ where = other; ++ if (horrible_mask_to_cidr(other->mask) <= my_cidr) ++ break; ++ } ++ if (!other && !where) ++ hlist_add_head(&node->table, &table->head); ++ else if (!other) ++ hlist_add_behind(&node->table, &where->table); ++ else ++ hlist_add_before(&node->table, &where->table); ++} ++static __init int horrible_routing_table_insert_v4(struct horrible_routing_table *table, struct in_addr *ip, uint8_t cidr, void *value) ++{ ++ struct horrible_routing_table_node *node = kzalloc(sizeof(struct horrible_routing_table_node), GFP_KERNEL); ++ if (!node) ++ return -ENOMEM; ++ node->ip.in = *ip; ++ node->mask = horrible_cidr_to_mask(cidr); ++ node->ip_version = 4; ++ node->value = value; ++ horrible_mask_self(node); ++ horrible_insert_ordered(table, node); ++ return 0; ++} ++static __init int horrible_routing_table_insert_v6(struct horrible_routing_table *table, struct in6_addr *ip, uint8_t cidr, void *value) ++{ ++ struct horrible_routing_table_node *node = kzalloc(sizeof(struct horrible_routing_table_node), GFP_KERNEL); ++ if (!node) ++ return -ENOMEM; ++ node->ip.in6 = *ip; ++ node->mask = horrible_cidr_to_mask(cidr); ++ node->ip_version = 6; ++ node->value = value; ++ horrible_mask_self(node); ++ horrible_insert_ordered(table, node); ++ return 0; ++} ++static __init void *horrible_routing_table_lookup_v4(struct horrible_routing_table *table, struct in_addr *ip) ++{ ++ struct horrible_routing_table_node *node; ++ void *ret = NULL; ++ hlist_for_each_entry (node, &table->head, table) { ++ if (node->ip_version != 4) ++ continue; ++ if (horrible_match_v4(node, ip)) { ++ ret = node->value; ++ break; ++ } ++ }; ++ return ret; ++} ++static __init void *horrible_routing_table_lookup_v6(struct horrible_routing_table *table, struct in6_addr *ip) ++{ ++ struct horrible_routing_table_node *node; ++ void *ret = NULL; ++ hlist_for_each_entry (node, &table->head, table) { ++ if (node->ip_version != 6) ++ continue; ++ if (horrible_match_v6(node, ip)) { ++ ret = node->value; ++ break; ++ } ++ }; ++ return ret; ++} ++ ++static __init bool randomized_test(void) ++{ ++ bool ret = false; ++ unsigned int i, j, k, mutate_amount, cidr; ++ struct wireguard_peer **peers, *peer; ++ struct routing_table t; ++ struct horrible_routing_table h; ++ u8 ip[16], mutate_mask[16], mutated[16]; ++ ++ routing_table_init(&t); ++ horrible_routing_table_init(&h); ++ ++ peers = kcalloc(NUM_PEERS, sizeof(struct wireguard_peer *), GFP_KERNEL); ++ if (!peers) { ++ pr_info("routing table random self-test: out of memory\n"); ++ goto free; ++ } ++ for (i = 0; i < NUM_PEERS; ++i) { ++ peers[i] = kzalloc(sizeof(struct wireguard_peer), GFP_KERNEL); ++ if (!peers[i]) { ++ pr_info("routing table random self-test: out of 
memory\n"); ++ goto free; ++ } ++ kref_init(&peers[i]->refcount); ++ } ++ ++ for (i = 0; i < NUM_RAND_ROUTES; ++i) { ++ prandom_bytes(ip, 4); ++ cidr = prandom_u32_max(32) + 1; ++ peer = peers[prandom_u32_max(NUM_PEERS)]; ++ if (routing_table_insert_v4(&t, (struct in_addr *)ip, cidr, peer) < 0) { ++ pr_info("routing table random self-test: out of memory\n"); ++ goto free; ++ } ++ if (horrible_routing_table_insert_v4(&h, (struct in_addr *)ip, cidr, peer) < 0) { ++ pr_info("routing table random self-test: out of memory\n"); ++ goto free; ++ } ++ for (j = 0; j < NUM_MUTATED_ROUTES; ++j) { ++ memcpy(mutated, ip, 4); ++ prandom_bytes(mutate_mask, 4); ++ mutate_amount = prandom_u32_max(32); ++ for (k = 0; k < mutate_amount / 8; ++k) ++ mutate_mask[k] = 0xff; ++ mutate_mask[k] = 0xff << ((8 - (mutate_amount % 8)) % 8); ++ for (; k < 4; ++k) ++ mutate_mask[k] = 0; ++ for (k = 0; k < 4; ++k) ++ mutated[k] = (mutated[k] & mutate_mask[k]) | (~mutate_mask[k] & prandom_u32_max(256)); ++ cidr = prandom_u32_max(32) + 1; ++ peer = peers[prandom_u32_max(NUM_PEERS)]; ++ if (routing_table_insert_v4(&t, (struct in_addr *)mutated, cidr, peer) < 0) { ++ pr_info("routing table random self-test: out of memory\n"); ++ goto free; ++ } ++ if (horrible_routing_table_insert_v4(&h, (struct in_addr *)mutated, cidr, peer)) { ++ pr_info("routing table random self-test: out of memory\n"); ++ goto free; ++ } ++ } ++ } ++ ++ for (i = 0; i < NUM_RAND_ROUTES; ++i) { ++ prandom_bytes(ip, 16); ++ cidr = prandom_u32_max(128) + 1; ++ peer = peers[prandom_u32_max(NUM_PEERS)]; ++ if (routing_table_insert_v6(&t, (struct in6_addr *)ip, cidr, peer) < 0) { ++ pr_info("routing table random self-test: out of memory\n"); ++ goto free; ++ } ++ if (horrible_routing_table_insert_v6(&h, (struct in6_addr *)ip, cidr, peer) < 0) { ++ pr_info("routing table random self-test: out of memory\n"); ++ goto free; ++ } ++ for (j = 0; j < NUM_MUTATED_ROUTES; ++j) { ++ memcpy(mutated, ip, 16); ++ prandom_bytes(mutate_mask, 16); ++ mutate_amount = prandom_u32_max(128); ++ for (k = 0; k < mutate_amount / 8; ++k) ++ mutate_mask[k] = 0xff; ++ mutate_mask[k] = 0xff << ((8 - (mutate_amount % 8)) % 8); ++ for (; k < 4; ++k) ++ mutate_mask[k] = 0; ++ for (k = 0; k < 4; ++k) ++ mutated[k] = (mutated[k] & mutate_mask[k]) | (~mutate_mask[k] & prandom_u32_max(256)); ++ cidr = prandom_u32_max(128) + 1; ++ peer = peers[prandom_u32_max(NUM_PEERS)]; ++ if (routing_table_insert_v6(&t, (struct in6_addr *)mutated, cidr, peer) < 0) { ++ pr_info("routing table random self-test: out of memory\n"); ++ goto free; ++ } ++ if (horrible_routing_table_insert_v6(&h, (struct in6_addr *)mutated, cidr, peer)) { ++ pr_info("routing table random self-test: out of memory\n"); ++ goto free; ++ } ++ } ++ } ++ ++#ifdef DEBUG_PRINT_TRIE_GRAPHVIZ ++ print_tree(t.root4, 32); ++ print_tree(t.root6, 128); ++#endif ++ ++ for (i = 0; i < NUM_QUERIES; ++i) { ++ prandom_bytes(ip, 4); ++ if (lookup(t.root4, 32, ip) != horrible_routing_table_lookup_v4(&h, (struct in_addr *)ip)) { ++ pr_info("routing table random self-test: FAIL\n"); ++ goto free; ++ } ++ } ++ ++ for (i = 0; i < NUM_QUERIES; ++i) { ++ prandom_bytes(ip, 16); ++ if (lookup(t.root6, 128, ip) != horrible_routing_table_lookup_v6(&h, (struct in6_addr *)ip)) { ++ pr_info("routing table random self-test: FAIL\n"); ++ goto free; ++ } ++ } ++ ret = true; ++ ++free: ++ routing_table_free(&t); ++ horrible_routing_table_free(&h); ++ if (peers) { ++ for (i = 0; i < NUM_PEERS; ++i) ++ kfree(peers[i]); ++ } ++ kfree(peers); ++ return ret; ++} ++#endif ++ 
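++
++/* Note on the helpers below: ip4() and ip6() hand back pointers into
++ * static storage, so each call clobbers the previous result; e.g.
++ * ip4(192, 168, 4, 20) stays valid only until the next call. That is
++ * acceptable solely because this __init self-test runs single-threaded. */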
++static __init inline struct in_addr *ip4(u8 a, u8 b, u8 c, u8 d) +{ + static struct in_addr ip; + u8 *split = (u8 *)&ip; @@ -5805,7 +6138,7 @@ + split[3] = d; + return &ip; +} -+static inline struct in6_addr *ip6(u32 a, u32 b, u32 c, u32 d) ++static __init inline struct in6_addr *ip6(u32 a, u32 b, u32 c, u32 d) +{ + static struct in6_addr ip; + __be32 *split = (__be32 *)&ip; @@ -5816,7 +6149,36 @@ + return &ip; +} + -+bool routing_table_selftest(void) ++#define init_peer(name) do { \ ++ name = kzalloc(sizeof(struct wireguard_peer), GFP_KERNEL); \ ++ if (!name) { \ ++ pr_info("routing table self-test: out of memory\n"); \ ++ goto free; \ ++ } \ ++ kref_init(&name->refcount); \ ++} while (0) ++ ++#define insert(version, mem, ipa, ipb, ipc, ipd, cidr) \ ++ routing_table_insert_v##version(&t, ip##version(ipa, ipb, ipc, ipd), cidr, mem) ++ ++#define maybe_fail \ ++ ++i; \ ++ if (!_s) { \ ++ pr_info("routing table self-test %zu: FAIL\n", i); \ ++ success = false; \ ++ } ++ ++#define test(version, mem, ipa, ipb, ipc, ipd) do { \ ++ bool _s = lookup(t.root##version, version == 4 ? 32 : 128, ip##version(ipa, ipb, ipc, ipd)) == mem; \ ++ maybe_fail \ ++} while (0) ++ ++#define test_negative(version, mem, ipa, ipb, ipc, ipd) do { \ ++ bool _s = lookup(t.root##version, version == 4 ? 32 : 128, ip##version(ipa, ipb, ipc, ipd)) != mem; \ ++ maybe_fail \ ++} while (0) ++ ++bool __init routing_table_selftest(void) +{ + struct routing_table t; + struct wireguard_peer *a = NULL, *b = NULL, *c = NULL, *d = NULL, *e = NULL, *f = NULL, *g = NULL, *h = NULL; @@ -5826,7 +6188,6 @@ + __be64 part; + + routing_table_init(&t); -+#define init_peer(name) do { name = kzalloc(sizeof(struct wireguard_peer), GFP_KERNEL); if (!name) goto free; kref_init(&name->refcount); } while (0) + init_peer(a); + init_peer(b); + init_peer(c); @@ -5835,9 +6196,7 @@ + init_peer(f); + init_peer(g); + init_peer(h); -+#undef init_peer + -+#define insert(version, mem, ipa, ipb, ipc, ipd, cidr) routing_table_insert_v##version(&t, ip##version(ipa, ipb, ipc, ipd), cidr, mem) + insert(4, a, 192, 168, 4, 0, 24); + insert(4, b, 192, 168, 4, 4, 32); + insert(4, c, 192, 168, 0, 0, 16); @@ -5851,6 +6210,8 @@ + insert(6, g, 0x24046800, 0, 0, 0, 32); + insert(6, h, 0x24046800, 0x40040800, 0xdeadbeef, 0xdeadbeef, 64); /* maskself is required */ + insert(6, a, 0x24046800, 0x40040800, 0xdeadbeef, 0xdeadbeef, 128); ++ insert(6, c, 0x24446800, 0x40e40800, 0xdeaebeef, 0xdefbeef, 128); ++ insert(6, b, 0x24446800, 0xf0e40800, 0xeeaebeef, 0, 98); + insert(4, g, 64, 15, 112, 0, 20); + insert(4, h, 64, 15, 123, 211, 25); /* maskself is required */ + insert(4, a, 10, 0, 0, 0, 25); @@ -5859,17 +6220,14 @@ + insert(4, b, 10, 1, 0, 4, 30); + insert(4, c, 10, 1, 0, 8, 29); + insert(4, d, 10, 1, 0, 16, 29); -+#undef insert ++ ++#ifdef DEBUG_PRINT_TRIE_GRAPHVIZ ++ print_tree(t.root4, 32); ++ print_tree(t.root6, 128); ++#endif + + success = true; -+#define test(version, mem, ipa, ipb, ipc, ipd) do { \ -+ bool _s = routing_table_lookup_v##version(&t, ip##version(ipa, ipb, ipc, ipd)) == mem; \ -+ ++i; \ -+ if (!_s) { \ -+ pr_info("routing table self-test %zu: FAIL\n", i); \ -+ success = false; \ -+ } \ -+} while (0) ++ + test(4, a, 192, 168, 4, 20); + test(4, a, 192, 168, 4, 0); + test(4, b, 192, 168, 4, 4); @@ -5898,7 +6256,25 @@ + test(4, b, 10, 1, 0, 6); + test(4, c, 10, 1, 0, 10); + test(4, d, 10, 1, 0, 20); -+#undef test ++ ++ insert(4, a, 1, 0, 0, 0, 32); ++ insert(4, a, 64, 0, 0, 0, 32); ++ insert(4, a, 128, 0, 0, 0, 32); ++ insert(4, a, 192, 0, 0, 0, 32); ++ 
insert(4, a, 255, 0, 0, 0, 32); ++ routing_table_remove_by_peer(&t, a); ++ test_negative(4, a, 1, 0, 0, 0); ++ test_negative(4, a, 64, 0, 0, 0); ++ test_negative(4, a, 128, 0, 0, 0); ++ test_negative(4, a, 192, 0, 0, 0); ++ test_negative(4, a, 255, 0, 0, 0); ++ ++ routing_table_free(&t); ++ routing_table_init(&t); ++ insert(4, a, 192, 168, 0, 0, 16); ++ insert(4, a, 192, 168, 0, 0, 24); ++ routing_table_remove_by_peer(&t, a); ++ test_negative(4, a, 192, 168, 0, 1); + + /* These will hit the BUG_ON(len >= 128) in free_node if something goes wrong. */ + for (i = 0; i < 128; ++i) { @@ -5908,6 +6284,11 @@ + routing_table_insert_v6(&t, &ip, 128, a); + } + ++#ifdef DEBUG_RANDOM_TRIE ++ if (success) ++ success = randomized_test(); ++#endif ++ + if (success) + pr_info("routing table self-tests: pass\n"); + @@ -5924,102 +6305,16 @@ + + return success; +} ++#undef test_negative ++#undef test ++#undef remove ++#undef insert ++#undef init_peer ++ +#endif ---- /dev/null -+++ b/net/wireguard/selftest/siphash.h -@@ -0,0 +1,89 @@ -+/* Test cases for siphash.c -+ * -+ * Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. -+ * -+ * SipHash: a fast short-input PRF -+ * https://131002.net/siphash/ -+ * -+ * This implementation is specifically for SipHash2-4. -+ */ -+ -+#ifdef DEBUG -+ -+#include -+#include -+#include -+#include -+ -+/* Test vectors taken from official reference source available at: -+ * https://131002.net/siphash/siphash24.c -+ */ -+static const u64 test_vectors[64] = { -+ 0x726fdb47dd0e0e31ULL, 0x74f839c593dc67fdULL, 0x0d6c8009d9a94f5aULL, -+ 0x85676696d7fb7e2dULL, 0xcf2794e0277187b7ULL, 0x18765564cd99a68dULL, -+ 0xcbc9466e58fee3ceULL, 0xab0200f58b01d137ULL, 0x93f5f5799a932462ULL, -+ 0x9e0082df0ba9e4b0ULL, 0x7a5dbbc594ddb9f3ULL, 0xf4b32f46226bada7ULL, -+ 0x751e8fbc860ee5fbULL, 0x14ea5627c0843d90ULL, 0xf723ca908e7af2eeULL, -+ 0xa129ca6149be45e5ULL, 0x3f2acc7f57c29bdbULL, 0x699ae9f52cbe4794ULL, -+ 0x4bc1b3f0968dd39cULL, 0xbb6dc91da77961bdULL, 0xbed65cf21aa2ee98ULL, -+ 0xd0f2cbb02e3b67c7ULL, 0x93536795e3a33e88ULL, 0xa80c038ccd5ccec8ULL, -+ 0xb8ad50c6f649af94ULL, 0xbce192de8a85b8eaULL, 0x17d835b85bbb15f3ULL, -+ 0x2f2e6163076bcfadULL, 0xde4daaaca71dc9a5ULL, 0xa6a2506687956571ULL, -+ 0xad87a3535c49ef28ULL, 0x32d892fad841c342ULL, 0x7127512f72f27cceULL, -+ 0xa7f32346f95978e3ULL, 0x12e0b01abb051238ULL, 0x15e034d40fa197aeULL, -+ 0x314dffbe0815a3b4ULL, 0x027990f029623981ULL, 0xcadcd4e59ef40c4dULL, -+ 0x9abfd8766a33735cULL, 0x0e3ea96b5304a7d0ULL, 0xad0c42d6fc585992ULL, -+ 0x187306c89bc215a9ULL, 0xd4a60abcf3792b95ULL, 0xf935451de4f21df2ULL, -+ 0xa9538f0419755787ULL, 0xdb9acddff56ca510ULL, 0xd06c98cd5c0975ebULL, -+ 0xe612a3cb9ecba951ULL, 0xc766e62cfcadaf96ULL, 0xee64435a9752fe72ULL, -+ 0xa192d576b245165aULL, 0x0a8787bf8ecb74b2ULL, 0x81b3e73d20b49b6fULL, -+ 0x7fa8220ba3b2eceaULL, 0x245731c13ca42499ULL, 0xb78dbfaf3a8d83bdULL, -+ 0xea1ad565322a1a0bULL, 0x60e61c23a3795013ULL, 0x6606d7e446282b93ULL, -+ 0x6ca4ecb15c5f91e1ULL, 0x9f626da15c9625f3ULL, 0xe51b38608ef25f57ULL, -+ 0x958a324ceb064572ULL -+}; -+static const siphash_key_t test_key = -+ { 0x0706050403020100ULL , 0x0f0e0d0c0b0a0908ULL }; -+ -+bool siphash_selftest(void) -+{ -+ u8 in[64] __aligned(SIPHASH_ALIGNMENT); -+ u8 in_unaligned[65]; -+ u8 i; -+ bool ret = true; -+ -+ for (i = 0; i < 64; ++i) { -+ in[i] = i; -+ in_unaligned[i + 1] = i; -+ if (siphash(in, i, test_key) != test_vectors[i]) { -+ pr_info("siphash self-test aligned %u: FAIL\n", i + 1); -+ ret = false; -+ } -+ if (siphash(in_unaligned + 1, i, test_key) != test_vectors[i]) { -+ 
pr_info("siphash self-test unaligned %u: FAIL\n", i + 1); -+ ret = false; -+ } -+ } -+ if (siphash_1u64(0x0706050403020100ULL, test_key) != test_vectors[8]) { -+ pr_info("siphash self-test 1u64: FAIL\n"); -+ ret = false; -+ } -+ if (siphash_2u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL, test_key) != test_vectors[16]) { -+ pr_info("siphash self-test 2u64: FAIL\n"); -+ ret = false; -+ } -+ if (siphash_3u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL, -+ 0x1716151413121110ULL, test_key) != test_vectors[24]) { -+ pr_info("siphash self-test 3u64: FAIL\n"); -+ ret = false; -+ } -+ if (siphash_4u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL, -+ 0x1716151413121110ULL, 0x1f1e1d1c1b1a1918ULL, test_key) != test_vectors[32]) { -+ pr_info("siphash self-test 4u64: FAIL\n"); -+ ret = false; -+ } -+ if (ret) -+ pr_info("siphash self-tests: pass\n"); -+ return ret; -+} -+#endif ---- /dev/null -+++ b/net/wireguard/crypto/blake2s.c -@@ -0,0 +1,278 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/crypto/blake2s.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,302 @@ +/* Original author: Samuel Neves + * + * Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. @@ -6126,12 +6421,36 @@ + memzero_explicit(block, BLAKE2S_BLOCKBYTES); +} + ++#ifdef CONFIG_X86_64 ++#include ++#include ++#include ++#include ++static bool blake2s_use_avx __read_mostly = false; ++void __init blake2s_fpu_init(void) ++{ ++ blake2s_use_avx = boot_cpu_has(X86_FEATURE_AVX); ++} ++asmlinkage void blake2s_compress_avx(struct blake2s_state *state, const u8 block[BLAKE2S_BLOCKBYTES]); ++#else ++void __init blake2s_fpu_init(void) { } ++#endif ++ +static inline void blake2s_compress(struct blake2s_state *state, const u8 block[BLAKE2S_BLOCKBYTES]) +{ + u32 m[16]; + u32 v[16]; + int i; + ++#ifdef CONFIG_X86_64 ++ if (blake2s_use_avx && irq_fpu_usable()) { ++ kernel_fpu_begin(); ++ blake2s_compress_avx(state, block); ++ kernel_fpu_end(); ++ return; ++ } ++#endif ++ + for (i = 0; i < 16; ++i) + m[i] = le32_to_cpuvp(block + i * sizeof(m[i])); + @@ -6298,9 +6617,9 @@ +} + +#include "../selftest/blake2s.h" ---- /dev/null -+++ b/net/wireguard/crypto/chacha20poly1305.c -@@ -0,0 +1,811 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/crypto/chacha20poly1305.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,886 @@ +/* + * Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. + * Copyright 2015 Martin Willi. 
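+ *
+ * Reader's note, an interpretation of the code below rather than part of
+ * the original patch: SIMD support is probed once at init by
+ * chacha20poly1305_fpu_init(), which latches chacha20poly1305_use_avx2,
+ * _use_ssse3 and _use_sse2 (or _use_neon on ARM); the crypt paths then
+ * branch on those flags only when chacha20poly1305_init_simd() says the
+ * FPU may be used in the current context.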
@@ -6315,10 +6634,11 @@ +#include +#include + -+#ifdef CONFIG_X86_64 ++#if defined(CONFIG_X86_64) +#include +#include +#ifdef CONFIG_AS_SSSE3 ++asmlinkage void hchacha20_asm_ssse3(u8 *derived_key, const u8 *nonce, const u8 *key); +asmlinkage void chacha20_asm_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); +asmlinkage void chacha20_asm_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); +#endif @@ -6333,14 +6653,28 @@ +static bool chacha20poly1305_use_avx2 __read_mostly = false; +static bool chacha20poly1305_use_ssse3 __read_mostly = false; +static bool chacha20poly1305_use_sse2 __read_mostly = false; -+void chacha20poly1305_init(void) ++void chacha20poly1305_fpu_init(void) +{ + chacha20poly1305_use_sse2 = boot_cpu_has(X86_FEATURE_XMM2); + chacha20poly1305_use_ssse3 = boot_cpu_has(X86_FEATURE_SSSE3); + chacha20poly1305_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2); +} ++#elif IS_ENABLED(CONFIG_KERNEL_MODE_NEON) ++#include ++#include ++asmlinkage void chacha20_asm_block_xor_neon(u32 *state, u8 *dst, const u8 *src); ++asmlinkage void chacha20_asm_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); ++static bool chacha20poly1305_use_neon __read_mostly = false; ++void __init chacha20poly1305_fpu_init(void) ++{ ++#if defined(CONFIG_ARM64) ++ chacha20poly1305_use_neon = elf_hwcap & HWCAP_ASIMD; ++#elif defined(CONFIG_ARM) ++ chacha20poly1305_use_neon = elf_hwcap & HWCAP_NEON; ++#endif ++} +#else -+void chacha20poly1305_init(void) { } ++void __init chacha20poly1305_fpu_init(void) { } +#endif + +#define CHACHA20_IV_SIZE 16 @@ -6443,7 +6777,7 @@ + +static const char constant[16] = "expand 32-byte k"; + -+static void hchacha20(u8 derived_key[CHACHA20POLY1305_KEYLEN], const u8 nonce[16], const u8 key[CHACHA20POLY1305_KEYLEN]) ++static void hchacha20_generic(u8 derived_key[CHACHA20POLY1305_KEYLEN], const u8 nonce[16], const u8 key[CHACHA20POLY1305_KEYLEN]) +{ + u32 x[CHACHA20_BLOCK_SIZE / sizeof(u32)]; + __le32 *out = (__force __le32 *)derived_key; @@ -6518,6 +6852,22 @@ + out[7] = cpu_to_le32(x[15]); +} + ++static inline void hchacha20(u8 derived_key[CHACHA20POLY1305_KEYLEN], const u8 nonce[16], const u8 key[CHACHA20POLY1305_KEYLEN], bool have_simd) ++{ ++ if (!have_simd) ++ goto no_simd; ++ ++#if defined(CONFIG_X86_64) && defined(CONFIG_AS_SSSE3) ++ if (chacha20poly1305_use_ssse3) { ++ hchacha20_asm_ssse3(derived_key, nonce, key); ++ return; ++ } ++#endif ++ ++no_simd: ++ hchacha20_generic(derived_key, nonce, key); ++} ++ +static void chacha20_keysetup(struct chacha20_ctx *ctx, const u8 key[CHACHA20_KEY_SIZE], const u8 nonce[sizeof(u64)]) +{ + ctx->state[0] = le32_to_cpuvp(constant + 0); @@ -6543,13 +6893,16 @@ + u8 buf[CHACHA20_BLOCK_SIZE]; + + if (!have_simd -+#ifdef CONFIG_X86_64 ++#if defined(CONFIG_X86_64) + || !chacha20poly1305_use_ssse3 ++ ++#elif IS_ENABLED(CONFIG_KERNEL_MODE_NEON) ++ || !chacha20poly1305_use_neon +#endif + ) + goto no_simd; + -+#ifdef CONFIG_X86_64 ++#if defined(CONFIG_X86_64) +#ifdef CONFIG_AS_AVX2 + if (chacha20poly1305_use_avx2) { + while (bytes >= CHACHA20_BLOCK_SIZE * 8) { @@ -6583,6 +6936,27 @@ + } + return; +#endif ++#elif IS_ENABLED(CONFIG_KERNEL_MODE_NEON) ++ while (bytes >= CHACHA20_BLOCK_SIZE * 4) { ++ chacha20_asm_4block_xor_neon(ctx->state, dst, src); ++ bytes -= CHACHA20_BLOCK_SIZE * 4; ++ src += CHACHA20_BLOCK_SIZE * 4; ++ dst += CHACHA20_BLOCK_SIZE * 4; ++ ctx->state[12] += 4; ++ } ++ while (bytes >= CHACHA20_BLOCK_SIZE) { ++ chacha20_asm_block_xor_neon(ctx->state, dst, src); ++ bytes -= CHACHA20_BLOCK_SIZE; ++ src += 
CHACHA20_BLOCK_SIZE; ++ dst += CHACHA20_BLOCK_SIZE; ++ ctx->state[12]++; ++ } ++ if (bytes) { ++ memcpy(buf, src, bytes); ++ chacha20_asm_block_xor_neon(ctx->state, buf, buf); ++ memcpy(dst, buf, bytes); ++ } ++ return; +#endif + +no_simd: @@ -6767,7 +7141,6 @@ + + if (ctx->buflen == POLY1305_BLOCK_SIZE) { +#ifdef CONFIG_X86_64 -+ + if (have_simd && chacha20poly1305_use_sse2) + poly1305_simd_blocks(ctx, ctx->buf, POLY1305_BLOCK_SIZE); + else @@ -6779,7 +7152,6 @@ + + if (likely(srclen >= POLY1305_BLOCK_SIZE)) { +#ifdef CONFIG_X86_64 -+ + if (have_simd && chacha20poly1305_use_sse2) + bytes = poly1305_simd_blocks(ctx, src, srclen); + else @@ -6871,16 +7243,16 @@ + .tfm = &chacha20_cipher +}; + -+bool chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, -+ const u8 *ad, const size_t ad_len, -+ const u64 nonce, const u8 key[CHACHA20POLY1305_KEYLEN]) ++static inline void __chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, ++ const u8 *ad, const size_t ad_len, ++ const u64 nonce, const u8 key[CHACHA20POLY1305_KEYLEN], ++ bool have_simd) +{ + struct poly1305_ctx poly1305_state; + struct chacha20_ctx chacha20_state; + u8 block0[CHACHA20_BLOCK_SIZE] = { 0 }; + __le64 len; + __le64 le_nonce = cpu_to_le64(nonce); -+ bool have_simd = chacha20poly1305_init_simd(); + + chacha20_keysetup(&chacha20_state, key, (u8 *)&le_nonce); + @@ -6906,10 +7278,16 @@ + + memzero_explicit(&poly1305_state, sizeof(poly1305_state)); + memzero_explicit(&chacha20_state, sizeof(chacha20_state)); ++} + ++void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, ++ const u8 *ad, const size_t ad_len, ++ const u64 nonce, const u8 key[CHACHA20POLY1305_KEYLEN]) ++{ ++ bool have_simd; ++ have_simd = chacha20poly1305_init_simd(); ++ __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, nonce, key, have_simd); + chacha20poly1305_deinit_simd(have_simd); -+ -+ return true; +} + +bool chacha20poly1305_encrypt_sg(struct scatterlist *dst, struct scatterlist *src, const size_t src_len, @@ -6919,6 +7297,7 @@ +{ + struct poly1305_ctx poly1305_state; + struct chacha20_ctx chacha20_state; ++ int ret = 0; + struct blkcipher_walk walk; + u8 block0[CHACHA20_BLOCK_SIZE] = { 0 }; + u8 mac[POLY1305_MAC_SIZE]; @@ -6936,19 +7315,21 @@ + + if (likely(src_len)) { + blkcipher_walk_init(&walk, dst, src, src_len); -+ blkcipher_walk_virt_block(&chacha20_desc, &walk, CHACHA20_BLOCK_SIZE); ++ ret = blkcipher_walk_virt_block(&chacha20_desc, &walk, CHACHA20_BLOCK_SIZE); + while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { + size_t chunk_len = rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE); + chacha20_crypt(&chacha20_state, walk.dst.virt.addr, walk.src.virt.addr, chunk_len, have_simd); + poly1305_update(&poly1305_state, walk.dst.virt.addr, chunk_len, have_simd); -+ blkcipher_walk_done(&chacha20_desc, &walk, walk.nbytes % CHACHA20_BLOCK_SIZE); ++ ret = blkcipher_walk_done(&chacha20_desc, &walk, walk.nbytes % CHACHA20_BLOCK_SIZE); + } + if (walk.nbytes) { + chacha20_crypt(&chacha20_state, walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes, have_simd); + poly1305_update(&poly1305_state, walk.dst.virt.addr, walk.nbytes, have_simd); -+ blkcipher_walk_done(&chacha20_desc, &walk, 0); ++ ret = blkcipher_walk_done(&chacha20_desc, &walk, 0); + } + } ++ if (unlikely(ret)) ++ goto err; + + poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf, have_simd); + @@ -6960,15 +7341,17 @@ + + poly1305_finish(&poly1305_state, mac); + scatterwalk_map_and_copy(mac, dst, src_len, sizeof(mac), 1); ++err: + 
memzero_explicit(&poly1305_state, sizeof(poly1305_state)); + memzero_explicit(&chacha20_state, sizeof(chacha20_state)); + memzero_explicit(mac, sizeof(mac)); -+ return true; ++ return !ret; +} + -+bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, -+ const u8 *ad, const size_t ad_len, -+ const u64 nonce, const u8 key[CHACHA20POLY1305_KEYLEN]) ++static inline bool __chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, ++ const u8 *ad, const size_t ad_len, ++ const u64 nonce, const u8 key[CHACHA20POLY1305_KEYLEN], ++ bool have_simd) +{ + struct poly1305_ctx poly1305_state; + struct chacha20_ctx chacha20_state; @@ -6978,13 +7361,10 @@ + size_t dst_len; + __le64 len; + __le64 le_nonce = cpu_to_le64(nonce); -+ bool have_simd; + + if (unlikely(src_len < POLY1305_MAC_SIZE)) + return false; + -+ have_simd = chacha20poly1305_init_simd(); -+ + chacha20_keysetup(&chacha20_state, key, (u8 *)&le_nonce); + + chacha20_crypt(&chacha20_state, block0, block0, sizeof(block0), have_simd); @@ -7014,10 +7394,20 @@ + + memzero_explicit(&chacha20_state, sizeof(chacha20_state)); + -+ chacha20poly1305_deinit_simd(have_simd); + return !ret; +} + ++bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, ++ const u8 *ad, const size_t ad_len, ++ const u64 nonce, const u8 key[CHACHA20POLY1305_KEYLEN]) ++{ ++ bool have_simd, ret; ++ have_simd = chacha20poly1305_init_simd(); ++ ret = __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len, nonce, key, have_simd); ++ chacha20poly1305_deinit_simd(have_simd); ++ return ret; ++} ++ +bool chacha20poly1305_decrypt_sg(struct scatterlist *dst, struct scatterlist *src, const size_t src_len, + const u8 *ad, const size_t ad_len, + const u64 nonce, const u8 key[CHACHA20POLY1305_KEYLEN]) @@ -7025,7 +7415,7 @@ + struct poly1305_ctx poly1305_state; + struct chacha20_ctx chacha20_state; + struct blkcipher_walk walk; -+ int ret; ++ int ret = 0; + u8 block0[CHACHA20_BLOCK_SIZE] = { 0 }; + u8 read_mac[POLY1305_MAC_SIZE], computed_mac[POLY1305_MAC_SIZE]; + size_t dst_len; @@ -7050,19 +7440,21 @@ + dst_len = src_len - POLY1305_MAC_SIZE; + if (likely(dst_len)) { + blkcipher_walk_init(&walk, dst, src, dst_len); -+ blkcipher_walk_virt_block(&chacha20_desc, &walk, CHACHA20_BLOCK_SIZE); ++ ret = blkcipher_walk_virt_block(&chacha20_desc, &walk, CHACHA20_BLOCK_SIZE); + while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { + size_t chunk_len = rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE); + poly1305_update(&poly1305_state, walk.src.virt.addr, chunk_len, have_simd); + chacha20_crypt(&chacha20_state, walk.dst.virt.addr, walk.src.virt.addr, chunk_len, have_simd); -+ blkcipher_walk_done(&chacha20_desc, &walk, walk.nbytes % CHACHA20_BLOCK_SIZE); ++ ret = blkcipher_walk_done(&chacha20_desc, &walk, walk.nbytes % CHACHA20_BLOCK_SIZE); + } + if (walk.nbytes) { + poly1305_update(&poly1305_state, walk.src.virt.addr, walk.nbytes, have_simd); + chacha20_crypt(&chacha20_state, walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes, have_simd); -+ blkcipher_walk_done(&chacha20_desc, &walk, 0); ++ ret = blkcipher_walk_done(&chacha20_desc, &walk, 0); + } + } ++ if (unlikely(ret)) ++ goto err; + + poly1305_update(&poly1305_state, pad0, (0x10 - dst_len) & 0xf, have_simd); + @@ -7077,6 +7469,7 @@ + + scatterwalk_map_and_copy(read_mac, src, dst_len, POLY1305_MAC_SIZE, 0); + ret = crypto_memneq(read_mac, computed_mac, POLY1305_MAC_SIZE); ++err: + memzero_explicit(read_mac, POLY1305_MAC_SIZE); + memzero_explicit(computed_mac, POLY1305_MAC_SIZE); + 
memzero_explicit(&chacha20_state, sizeof(chacha20_state)); @@ -7085,17 +7478,17 @@ +} + + -+bool xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, ++void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, + const u8 *ad, const size_t ad_len, + const u8 nonce[XCHACHA20POLY1305_NONCELEN], + const u8 key[CHACHA20POLY1305_KEYLEN]) +{ -+ u8 derived_key[CHACHA20POLY1305_KEYLEN]; -+ bool ret; -+ hchacha20(derived_key, nonce, key); -+ ret = chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, le64_to_cpuvp(nonce + 16), derived_key); ++ bool have_simd = chacha20poly1305_init_simd(); ++ u8 derived_key[CHACHA20POLY1305_KEYLEN] __aligned(16); ++ hchacha20(derived_key, nonce, key, have_simd); ++ __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, le64_to_cpuvp(nonce + 16), derived_key, have_simd); + memzero_explicit(derived_key, CHACHA20POLY1305_KEYLEN); -+ return ret; ++ chacha20poly1305_deinit_simd(have_simd); +} + +bool xchacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, @@ -7103,18 +7496,19 @@ + const u8 nonce[XCHACHA20POLY1305_NONCELEN], + const u8 key[CHACHA20POLY1305_KEYLEN]) +{ -+ u8 derived_key[CHACHA20POLY1305_KEYLEN]; -+ bool ret; -+ hchacha20(derived_key, nonce, key); -+ ret = chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len, le64_to_cpuvp(nonce + 16), derived_key); ++ bool ret, have_simd = chacha20poly1305_init_simd(); ++ u8 derived_key[CHACHA20POLY1305_KEYLEN] __aligned(16); ++ hchacha20(derived_key, nonce, key, have_simd); ++ ret = __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len, le64_to_cpuvp(nonce + 16), derived_key, have_simd); + memzero_explicit(derived_key, CHACHA20POLY1305_KEYLEN); ++ chacha20poly1305_deinit_simd(have_simd); + return ret; +} + +#include "../selftest/chacha20poly1305.h" ---- /dev/null -+++ b/net/wireguard/crypto/curve25519.c -@@ -0,0 +1,1240 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/crypto/curve25519.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,1631 @@ +/* Original author: Adam Langley + * + * Copyright 2008 Google Inc. All Rights Reserved. 
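The hunk above reworks the XChaCha20 wrappers: one SIMD session, opened with chacha20poly1305_init_simd() and closed with chacha20poly1305_deinit_simd(), now brackets both the HChaCha20 subkey derivation and the AEAD body, instead of hchacha20() and the inner encrypt/decrypt each paying for their own kernel_fpu_begin()/kernel_fpu_end() window, and the derived key gains 16-byte alignment for the vectorized code. Encryption now returns void, since ChaCha20-Poly1305 encryption cannot fail; the decrypt entry points stay bool and become __must_check, so a skipped tag check draws a compiler warning. A minimal caller-side sketch of the resulting API (the function and buffer names are hypothetical; only the entry points and length constants from chacha20poly1305.h are real):

/* Illustrative sketch, not part of this patch. */
#include "chacha20poly1305.h"

static bool xchacha_roundtrip_example(void)
{
	static const u8 key[CHACHA20POLY1305_KEYLEN] = { 1 };
	static const u8 nonce[XCHACHA20POLY1305_NONCELEN] = { 2 };
	u8 msg[64] = "example plaintext";
	u8 sealed[sizeof(msg) + CHACHA20POLY1305_AUTHTAGLEN];
	u8 opened[sizeof(msg)];

	/* Sealing cannot fail, hence the void return after this patch. */
	xchacha20poly1305_encrypt(sealed, msg, sizeof(msg), NULL, 0, nonce, key);

	/* Internally, the first 16 nonce bytes feed HChaCha20 to derive a
	 * subkey and the last 8 become the 64-bit ChaCha20 nonce; a false
	 * return here means the Poly1305 tag did not verify. */
	return xchacha20poly1305_decrypt(opened, sealed, sizeof(sealed),
					 NULL, 0, nonce, key);
}
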
@@ -7127,17 +7521,203 @@ +#include +#include + ++#define ARCH_HAS_SEPARATE_IRQ_STACK ++ ++#if (defined(CONFIG_MIPS) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0)) || defined(CONFIG_ARM) ++#undef ARCH_HAS_SEPARATE_IRQ_STACK ++#endif ++ +static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE]) +{ + secret[0] &= 248; + secret[31] &= 127; + secret[31] |= 64; +} ++static const u8 null_point[CURVE25519_POINT_SIZE] = { 0 }; ++ ++#if defined(CONFIG_X86_64) ++#include ++#include ++#include ++#include ++static bool curve25519_use_avx __read_mostly = false; ++void curve25519_fpu_init(void) ++{ ++ curve25519_use_avx = boot_cpu_has(X86_FEATURE_AVX); ++} ++ ++typedef u64 fe[10]; ++typedef u64 fe51[5]; ++asmlinkage void curve25519_sandy2x_ladder(fe *, const u8 *); ++asmlinkage void curve25519_sandy2x_ladder_base(fe *, const u8 *); ++asmlinkage void curve25519_sandy2x_fe51_pack(u8 *, const fe51 *); ++asmlinkage void curve25519_sandy2x_fe51_mul(fe51 *, const fe51 *, const fe51 *); ++asmlinkage void curve25519_sandy2x_fe51_nsquare(fe51 *, const fe51 *, int); ++ ++static inline u32 le24_to_cpupv(const u8 *in) ++{ ++ return le16_to_cpup((__le16 *)in) | ((u32)in[2]) << 16; ++} ++ ++static inline void fe_frombytes(fe h, const u8 *s) ++{ ++ u64 h0 = le32_to_cpup((__le32 *)s); ++ u64 h1 = le24_to_cpupv(s + 4) << 6; ++ u64 h2 = le24_to_cpupv(s + 7) << 5; ++ u64 h3 = le24_to_cpupv(s + 10) << 3; ++ u64 h4 = le24_to_cpupv(s + 13) << 2; ++ u64 h5 = le32_to_cpup((__le32 *)(s + 16)); ++ u64 h6 = le24_to_cpupv(s + 20) << 7; ++ u64 h7 = le24_to_cpupv(s + 23) << 5; ++ u64 h8 = le24_to_cpupv(s + 26) << 4; ++ u64 h9 = (le24_to_cpupv(s + 29) & 8388607) << 2; ++ u64 carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7, carry8, carry9; ++ ++ carry9 = h9 >> 25; h0 += carry9 * 19; h9 &= 0x1FFFFFF; ++ carry1 = h1 >> 25; h2 += carry1; h1 &= 0x1FFFFFF; ++ carry3 = h3 >> 25; h4 += carry3; h3 &= 0x1FFFFFF; ++ carry5 = h5 >> 25; h6 += carry5; h5 &= 0x1FFFFFF; ++ carry7 = h7 >> 25; h8 += carry7; h7 &= 0x1FFFFFF; ++ ++ carry0 = h0 >> 26; h1 += carry0; h0 &= 0x3FFFFFF; ++ carry2 = h2 >> 26; h3 += carry2; h2 &= 0x3FFFFFF; ++ carry4 = h4 >> 26; h5 += carry4; h4 &= 0x3FFFFFF; ++ carry6 = h6 >> 26; h7 += carry6; h6 &= 0x3FFFFFF; ++ carry8 = h8 >> 26; h9 += carry8; h8 &= 0x3FFFFFF; ++ ++ h[0] = h0; ++ h[1] = h1; ++ h[2] = h2; ++ h[3] = h3; ++ h[4] = h4; ++ h[5] = h5; ++ h[6] = h6; ++ h[7] = h7; ++ h[8] = h8; ++ h[9] = h9; ++} ++ ++static inline void fe51_invert(fe51 *r, const fe51 *x) ++{ ++ fe51 z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t; ++ ++ /* 2 */ curve25519_sandy2x_fe51_nsquare(&z2, x, 1); ++ /* 4 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2, 1); ++ /* 8 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 1); ++ /* 9 */ curve25519_sandy2x_fe51_mul(&z9, (const fe51 *)&t, x); ++ /* 11 */ curve25519_sandy2x_fe51_mul(&z11, (const fe51 *)&z9, (const fe51 *)&z2); ++ /* 22 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z11, 1); ++ /* 2^5 - 2^0 = 31 */ curve25519_sandy2x_fe51_mul(&z2_5_0, (const fe51 *)&t, (const fe51 *)&z9); ++ ++ /* 2^10 - 2^5 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_5_0, 5); ++ /* 2^10 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_10_0, (const fe51 *)&t, (const fe51 *)&z2_5_0); ++ ++ /* 2^20 - 2^10 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_10_0, 10); ++ /* 2^20 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_20_0, (const fe51 *)&t, (const fe51 *)&z2_10_0); ++ ++ /* 2^40 - 2^20 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 
*)&z2_20_0, 20); ++ /* 2^40 - 2^0 */ curve25519_sandy2x_fe51_mul(&t, (const fe51 *)&t, (const fe51 *)&z2_20_0); ++ ++ /* 2^50 - 2^10 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 10); ++ /* 2^50 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_50_0, (const fe51 *)&t, (const fe51 *)&z2_10_0); ++ ++ /* 2^100 - 2^50 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_50_0, 50); ++ /* 2^100 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_100_0, (const fe51 *)&t, (const fe51 *)&z2_50_0); ++ ++ /* 2^200 - 2^100 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_100_0, 100); ++ /* 2^200 - 2^0 */ curve25519_sandy2x_fe51_mul(&t, (const fe51 *)&t, (const fe51 *)&z2_100_0); ++ ++ /* 2^250 - 2^50 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 50); ++ /* 2^250 - 2^0 */ curve25519_sandy2x_fe51_mul(&t, (const fe51 *)&t, (const fe51 *)&z2_50_0); ++ ++ /* 2^255 - 2^5 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 5); ++ /* 2^255 - 21 */ curve25519_sandy2x_fe51_mul(r, (const fe51 *)t, (const fe51 *)&z11); ++} ++ ++static void curve25519_sandy2x(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]) ++{ ++ u8 e[32]; ++ fe var[3]; ++ fe51 x_51, z_51; ++ memcpy(e, secret, 32); ++ normalize_secret(e); ++#define x1 var[0] ++#define x2 var[1] ++#define z2 var[2] ++ fe_frombytes(x1, basepoint); ++ curve25519_sandy2x_ladder(var, e); ++ z_51[0] = (z2[1] << 26) + z2[0]; ++ z_51[1] = (z2[3] << 26) + z2[2]; ++ z_51[2] = (z2[5] << 26) + z2[4]; ++ z_51[3] = (z2[7] << 26) + z2[6]; ++ z_51[4] = (z2[9] << 26) + z2[8]; ++ x_51[0] = (x2[1] << 26) + x2[0]; ++ x_51[1] = (x2[3] << 26) + x2[2]; ++ x_51[2] = (x2[5] << 26) + x2[4]; ++ x_51[3] = (x2[7] << 26) + x2[6]; ++ x_51[4] = (x2[9] << 26) + x2[8]; ++#undef x1 ++#undef x2 ++#undef z2 ++ fe51_invert(&z_51, (const fe51 *)&z_51); ++ curve25519_sandy2x_fe51_mul(&x_51, (const fe51 *)&x_51, (const fe51 *)&z_51); ++ curve25519_sandy2x_fe51_pack(mypublic, (const fe51 *)&x_51); ++ ++ memzero_explicit(e, sizeof(e)); ++ memzero_explicit(var, sizeof(var)); ++ memzero_explicit(x_51, sizeof(x_51)); ++ memzero_explicit(z_51, sizeof(z_51)); ++} ++ ++static void curve25519_sandy2x_base(u8 pub[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE]) ++{ ++ u8 e[32]; ++ fe var[3]; ++ fe51 x_51, z_51; ++ memcpy(e, secret, 32); ++ normalize_secret(e); ++ curve25519_sandy2x_ladder_base(var, e); ++#define x2 var[0] ++#define z2 var[1] ++ z_51[0] = (z2[1] << 26) + z2[0]; ++ z_51[1] = (z2[3] << 26) + z2[2]; ++ z_51[2] = (z2[5] << 26) + z2[4]; ++ z_51[3] = (z2[7] << 26) + z2[6]; ++ z_51[4] = (z2[9] << 26) + z2[8]; ++ x_51[0] = (x2[1] << 26) + x2[0]; ++ x_51[1] = (x2[3] << 26) + x2[2]; ++ x_51[2] = (x2[5] << 26) + x2[4]; ++ x_51[3] = (x2[7] << 26) + x2[6]; ++ x_51[4] = (x2[9] << 26) + x2[8]; ++#undef x2 ++#undef z2 ++ fe51_invert(&z_51, (const fe51 *)&z_51); ++ curve25519_sandy2x_fe51_mul(&x_51, (const fe51 *)&x_51, (const fe51 *)&z_51); ++ curve25519_sandy2x_fe51_pack(pub, (const fe51 *)&x_51); ++ ++ memzero_explicit(e, sizeof(e)); ++ memzero_explicit(var, sizeof(var)); ++ memzero_explicit(x_51, sizeof(x_51)); ++ memzero_explicit(z_51, sizeof(z_51)); ++} ++#elif IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && defined(CONFIG_ARM) ++#include ++#include ++#include ++asmlinkage void curve25519_asm_neon(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]); ++static bool curve25519_use_neon __read_mostly = false; ++void __init curve25519_fpu_init(void) ++{ 
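++	/* elf_hwcap is the CPU capability bitmask populated at boot; testing
++	 * HWCAP_NEON once in this __init hook caches the answer, so the hot
++	 * path in curve25519() below only has to read a bool. */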
++ curve25519_use_neon = elf_hwcap & HWCAP_NEON; ++} ++#else ++void __init curve25519_fpu_init(void) { } ++#endif + +#ifdef __SIZEOF_INT128__ +typedef u64 limb; +typedef limb felem[5]; -+typedef __uint128_t uint128_t; ++typedef __uint128_t u128; + +/* Sum two numbers: output += in */ +static __always_inline void fsum(limb *output, const limb *in) @@ -7171,21 +7751,21 @@ +/* Multiply a number by a scalar: output = in * scalar */ +static __always_inline void fscalar_product(felem output, const felem in, const limb scalar) +{ -+ uint128_t a; ++ u128 a; + -+ a = ((uint128_t) in[0]) * scalar; ++ a = ((u128) in[0]) * scalar; + output[0] = ((limb)a) & 0x7ffffffffffffUL; + -+ a = ((uint128_t) in[1]) * scalar + ((limb) (a >> 51)); ++ a = ((u128) in[1]) * scalar + ((limb) (a >> 51)); + output[1] = ((limb)a) & 0x7ffffffffffffUL; + -+ a = ((uint128_t) in[2]) * scalar + ((limb) (a >> 51)); ++ a = ((u128) in[2]) * scalar + ((limb) (a >> 51)); + output[2] = ((limb)a) & 0x7ffffffffffffUL; + -+ a = ((uint128_t) in[3]) * scalar + ((limb) (a >> 51)); ++ a = ((u128) in[3]) * scalar + ((limb) (a >> 51)); + output[3] = ((limb)a) & 0x7ffffffffffffUL; + -+ a = ((uint128_t) in[4]) * scalar + ((limb) (a >> 51)); ++ a = ((u128) in[4]) * scalar + ((limb) (a >> 51)); + output[4] = ((limb)a) & 0x7ffffffffffffUL; + + output[0] += (a >> 51) * 19; @@ -7201,7 +7781,7 @@ + */ +static __always_inline void fmul(felem output, const felem in2, const felem in) +{ -+ uint128_t t[5]; ++ u128 t[5]; + limb r0,r1,r2,r3,r4,s0,s1,s2,s3,s4,c; + + r0 = in[0]; @@ -7216,21 +7796,21 @@ + s3 = in2[3]; + s4 = in2[4]; + -+ t[0] = ((uint128_t) r0) * s0; -+ t[1] = ((uint128_t) r0) * s1 + ((uint128_t) r1) * s0; -+ t[2] = ((uint128_t) r0) * s2 + ((uint128_t) r2) * s0 + ((uint128_t) r1) * s1; -+ t[3] = ((uint128_t) r0) * s3 + ((uint128_t) r3) * s0 + ((uint128_t) r1) * s2 + ((uint128_t) r2) * s1; -+ t[4] = ((uint128_t) r0) * s4 + ((uint128_t) r4) * s0 + ((uint128_t) r3) * s1 + ((uint128_t) r1) * s3 + ((uint128_t) r2) * s2; ++ t[0] = ((u128) r0) * s0; ++ t[1] = ((u128) r0) * s1 + ((u128) r1) * s0; ++ t[2] = ((u128) r0) * s2 + ((u128) r2) * s0 + ((u128) r1) * s1; ++ t[3] = ((u128) r0) * s3 + ((u128) r3) * s0 + ((u128) r1) * s2 + ((u128) r2) * s1; ++ t[4] = ((u128) r0) * s4 + ((u128) r4) * s0 + ((u128) r3) * s1 + ((u128) r1) * s3 + ((u128) r2) * s2; + + r4 *= 19; + r1 *= 19; + r2 *= 19; + r3 *= 19; + -+ t[0] += ((uint128_t) r4) * s1 + ((uint128_t) r1) * s4 + ((uint128_t) r2) * s3 + ((uint128_t) r3) * s2; -+ t[1] += ((uint128_t) r4) * s2 + ((uint128_t) r2) * s4 + ((uint128_t) r3) * s3; -+ t[2] += ((uint128_t) r4) * s3 + ((uint128_t) r3) * s4; -+ t[3] += ((uint128_t) r4) * s4; ++ t[0] += ((u128) r4) * s1 + ((u128) r1) * s4 + ((u128) r2) * s3 + ((u128) r3) * s2; ++ t[1] += ((u128) r4) * s2 + ((u128) r2) * s4 + ((u128) r3) * s3; ++ t[2] += ((u128) r4) * s3 + ((u128) r3) * s4; ++ t[3] += ((u128) r4) * s4; + + r0 = (limb)t[0] & 0x7ffffffffffffUL; c = (limb)(t[0] >> 51); + t[1] += c; r1 = (limb)t[1] & 0x7ffffffffffffUL; c = (limb)(t[1] >> 51); @@ -7250,7 +7830,7 @@ + +static __always_inline void fsquare_times(felem output, const felem in, limb count) +{ -+ uint128_t t[5]; ++ u128 t[5]; + limb r0,r1,r2,r3,r4,c; + limb d0,d1,d2,d4,d419; + @@ -7267,11 +7847,11 @@ + d419 = r4 * 19; + d4 = d419 * 2; + -+ t[0] = ((uint128_t) r0) * r0 + ((uint128_t) d4) * r1 + (((uint128_t) d2) * (r3 )); -+ t[1] = ((uint128_t) d0) * r1 + ((uint128_t) d4) * r2 + (((uint128_t) r3) * (r3 * 19)); -+ t[2] = ((uint128_t) d0) * r2 + ((uint128_t) r1) * r1 + (((uint128_t) d4) * (r3 )); -+ 
t[3] = ((uint128_t) d0) * r3 + ((uint128_t) d1) * r2 + (((uint128_t) r4) * (d419 )); -+ t[4] = ((uint128_t) d0) * r4 + ((uint128_t) d1) * r3 + (((uint128_t) r2) * (r2 )); ++ t[0] = ((u128) r0) * r0 + ((u128) d4) * r1 + (((u128) d2) * (r3 )); ++ t[1] = ((u128) d0) * r1 + ((u128) d4) * r2 + (((u128) r3) * (r3 * 19)); ++ t[2] = ((u128) d0) * r2 + ((u128) r1) * r1 + (((u128) d4) * (r3 )); ++ t[3] = ((u128) d0) * r3 + ((u128) d1) * r2 + (((u128) r4) * (d419 )); ++ t[4] = ((u128) d0) * r4 + ((u128) d1) * r3 + (((u128) r2) * (r2 )); + + r0 = (limb)t[0] & 0x7ffffffffffffUL; c = (limb)(t[0] >> 51); + t[1] += c; r1 = (limb)t[1] & 0x7ffffffffffffUL; c = (limb)(t[1] >> 51); @@ -7293,12 +7873,12 @@ +/* Load a little-endian 64-bit number */ +static inline limb load_limb(const u8 *in) +{ -+ return le64_to_cpu(*(u64 *)in); ++ return le64_to_cpu(*(__le64 *)in); +} + +static inline void store_limb(u8 *out, limb in) +{ -+ *(u64 *)out = cpu_to_le64(in); ++ *(__le64 *)out = cpu_to_le64(in); +} + +/* Take a little-endian, 32-byte number and expand it into polynomial form */ @@ -7316,7 +7896,7 @@ + */ +static void fcontract(u8 *output, const felem input) +{ -+ uint128_t t[5]; ++ u128 t[5]; + + t[0] = input[0]; + t[1] = input[1]; @@ -7510,27 +8090,50 @@ + /* 2^255 - 21 */ fmul(out, t0, a); +} + -+void curve25519(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]) ++bool curve25519(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]) +{ -+ limb bp[5], x[5], z[5], zmone[5]; -+ u8 e[32]; ++#ifdef CONFIG_X86_64 ++ if (curve25519_use_avx && irq_fpu_usable()) { ++ kernel_fpu_begin(); ++ curve25519_sandy2x(mypublic, secret, basepoint); ++ kernel_fpu_end(); ++ } else ++#endif ++ { ++ limb bp[5], x[5], z[5], zmone[5]; ++ u8 e[32]; + -+ memcpy(e, secret, 32); -+ normalize_secret(e); ++ memcpy(e, secret, 32); ++ normalize_secret(e); + -+ fexpand(bp, basepoint); -+ cmult(x, z, e, bp); -+ crecip(zmone, z); -+ fmul(z, x, zmone); -+ fcontract(mypublic, z); ++ fexpand(bp, basepoint); ++ cmult(x, z, e, bp); ++ crecip(zmone, z); ++ fmul(z, x, zmone); ++ fcontract(mypublic, z); + -+ memzero_explicit(e, sizeof(e)); -+ memzero_explicit(bp, sizeof(bp)); -+ memzero_explicit(x, sizeof(x)); -+ memzero_explicit(z, sizeof(z)); -+ memzero_explicit(zmone, sizeof(zmone)); ++ memzero_explicit(e, sizeof(e)); ++ memzero_explicit(bp, sizeof(bp)); ++ memzero_explicit(x, sizeof(x)); ++ memzero_explicit(z, sizeof(z)); ++ memzero_explicit(zmone, sizeof(zmone)); ++ } ++ return crypto_memneq(mypublic, null_point, CURVE25519_POINT_SIZE); +} + ++bool curve25519_generate_public(u8 pub[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE]) ++{ ++ static const u8 basepoint[CURVE25519_POINT_SIZE] __aligned(32) = { 9 }; ++#ifdef CONFIG_X86_64 ++ if (curve25519_use_avx && irq_fpu_usable()) { ++ kernel_fpu_begin(); ++ curve25519_sandy2x_base(pub, secret); ++ kernel_fpu_end(); ++ return crypto_memneq(pub, null_point, CURVE25519_POINT_SIZE); ++ } ++#endif ++ return curve25519(pub, secret, basepoint); ++} +#else +typedef s64 limb; + @@ -8181,6 +8784,7 @@ +} + + ++#ifdef ARCH_HAS_SEPARATE_IRQ_STACK +/* Input: Q, Q', Q-Q' + * Output: 2Q, Q+Q' + * @@ -8320,44 +8924,225 @@ + memcpy(resultz, nqz, sizeof(limb) * 10); +} + -+void curve25519(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]) ++bool curve25519(u8 mypublic[CURVE25519_POINT_SIZE], const u8 
secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]) +{ ++#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && defined(CONFIG_ARM) ++ if (curve25519_use_neon && may_use_simd()) { ++ kernel_neon_begin(); ++ curve25519_asm_neon(mypublic, secret, basepoint); ++ kernel_neon_end(); ++ } else ++#endif ++ { ++ limb bp[10], x[10], z[11], zmone[10]; ++ u8 e[32]; ++ ++ memcpy(e, secret, 32); ++ normalize_secret(e); ++ ++ fexpand(bp, basepoint); ++ cmult(x, z, e, bp); ++ crecip(zmone, z); ++ fmul(z, x, zmone); ++ fcontract(mypublic, z); ++ ++ memzero_explicit(e, sizeof(e)); ++ memzero_explicit(bp, sizeof(bp)); ++ memzero_explicit(x, sizeof(x)); ++ memzero_explicit(z, sizeof(z)); ++ memzero_explicit(zmone, sizeof(zmone)); ++ } ++ return crypto_memneq(mypublic, null_point, CURVE25519_POINT_SIZE); ++} ++#else ++struct other_stack { ++ limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19], zzprime[19], zzzprime[19], xxxprime[19]; ++ limb a[19], b[19], c[19], d[19], e[19], f[19], g[19], h[19]; + limb bp[10], x[10], z[11], zmone[10]; -+ u8 e[32]; ++ u8 ee[32]; ++}; + -+ memcpy(e, secret, 32); -+ normalize_secret(e); ++/* Input: Q, Q', Q-Q' ++ * Output: 2Q, Q+Q' ++ * ++ * x2 z3: long form ++ * x3 z3: long form ++ * x z: short form, destroyed ++ * xprime zprime: short form, destroyed ++ * qmqp: short form, preserved ++ * ++ * On entry and exit, the absolute value of the limbs of all inputs and outputs ++ * are < 2^26. */ ++static void fmonty(struct other_stack *s, ++ limb *x2, limb *z2, /* output 2Q */ ++ limb *x3, limb *z3, /* output Q + Q' */ ++ limb *x, limb *z, /* input Q */ ++ limb *xprime, limb *zprime, /* input Q' */ ++ const limb *qmqp /* input Q - Q' */) ++{ ++ memcpy(s->origx, x, 10 * sizeof(limb)); ++ fsum(x, z); ++ /* |x[i]| < 2^27 */ ++ fdifference(z, s->origx); /* does x - z */ ++ /* |z[i]| < 2^27 */ + -+ fexpand(bp, basepoint); -+ cmult(x, z, e, bp); -+ crecip(zmone, z); -+ fmul(z, x, zmone); -+ fcontract(mypublic, z); ++ memcpy(s->origxprime, xprime, sizeof(limb) * 10); ++ fsum(xprime, zprime); ++ /* |xprime[i]| < 2^27 */ ++ fdifference(zprime, s->origxprime); ++ /* |zprime[i]| < 2^27 */ ++ fproduct(s->xxprime, xprime, z); ++ /* |s->xxprime[i]| < 14*2^54: the largest product of two limbs will be < ++ * 2^(27+27) and fproduct adds together, at most, 14 of those products. ++ * (Approximating that to 2^58 doesn't work out.) 
*/ ++ fproduct(s->zzprime, x, zprime); ++ /* |s->zzprime[i]| < 14*2^54 */ ++ freduce_degree(s->xxprime); ++ freduce_coefficients(s->xxprime); ++ /* |s->xxprime[i]| < 2^26 */ ++ freduce_degree(s->zzprime); ++ freduce_coefficients(s->zzprime); ++ /* |s->zzprime[i]| < 2^26 */ ++ memcpy(s->origxprime, s->xxprime, sizeof(limb) * 10); ++ fsum(s->xxprime, s->zzprime); ++ /* |s->xxprime[i]| < 2^27 */ ++ fdifference(s->zzprime, s->origxprime); ++ /* |s->zzprime[i]| < 2^27 */ ++ fsquare(s->xxxprime, s->xxprime); ++ /* |s->xxxprime[i]| < 2^26 */ ++ fsquare(s->zzzprime, s->zzprime); ++ /* |s->zzzprime[i]| < 2^26 */ ++ fproduct(s->zzprime, s->zzzprime, qmqp); ++ /* |s->zzprime[i]| < 14*2^52 */ ++ freduce_degree(s->zzprime); ++ freduce_coefficients(s->zzprime); ++ /* |s->zzprime[i]| < 2^26 */ ++ memcpy(x3, s->xxxprime, sizeof(limb) * 10); ++ memcpy(z3, s->zzprime, sizeof(limb) * 10); + -+ memzero_explicit(e, sizeof(e)); -+ memzero_explicit(bp, sizeof(bp)); -+ memzero_explicit(x, sizeof(x)); -+ memzero_explicit(z, sizeof(z)); -+ memzero_explicit(zmone, sizeof(zmone)); ++ fsquare(s->xx, x); ++ /* |s->xx[i]| < 2^26 */ ++ fsquare(s->zz, z); ++ /* |s->zz[i]| < 2^26 */ ++ fproduct(x2, s->xx, s->zz); ++ /* |x2[i]| < 14*2^52 */ ++ freduce_degree(x2); ++ freduce_coefficients(x2); ++ /* |x2[i]| < 2^26 */ ++ fdifference(s->zz, s->xx); // does s->zz = s->xx - s->zz ++ /* |s->zz[i]| < 2^27 */ ++ memset(s->zzz + 10, 0, sizeof(limb) * 9); ++ fscalar_product(s->zzz, s->zz, 121665); ++ /* |s->zzz[i]| < 2^(27+17) */ ++ /* No need to call freduce_degree here: ++ fscalar_product doesn't increase the degree of its input. */ ++ freduce_coefficients(s->zzz); ++ /* |s->zzz[i]| < 2^26 */ ++ fsum(s->zzz, s->xx); ++ /* |s->zzz[i]| < 2^27 */ ++ fproduct(z2, s->zz, s->zzz); ++ /* |z2[i]| < 14*2^(26+27) */ ++ freduce_degree(z2); ++ freduce_coefficients(z2); ++ /* |z2|i| < 2^26 */ ++} ++ ++/* Calculates nQ where Q is the x-coordinate of a point on the curve ++ * ++ * resultx/resultz: the x coordinate of the resulting curve point (short form) ++ * n: a little endian, 32-byte number ++ * q: a point of the curve (short form) */ ++static void cmult(struct other_stack *s, limb *resultx, limb *resultz, const u8 *n, const limb *q) ++{ ++ unsigned i, j; ++ limb *nqpqx = s->a, *nqpqz = s->b, *nqx = s->c, *nqz = s->d, *t; ++ limb *nqpqx2 = s->e, *nqpqz2 = s->f, *nqx2 = s->g, *nqz2 = s->h; ++ ++ *nqpqz = *nqx = *nqpqz2 = *nqz2 = 1; ++ memcpy(nqpqx, q, sizeof(limb) * 10); ++ ++ for (i = 0; i < 32; ++i) { ++ u8 byte = n[31 - i]; ++ for (j = 0; j < 8; ++j) { ++ const limb bit = byte >> 7; ++ ++ swap_conditional(nqx, nqpqx, bit); ++ swap_conditional(nqz, nqpqz, bit); ++ fmonty(s, ++ nqx2, nqz2, ++ nqpqx2, nqpqz2, ++ nqx, nqz, ++ nqpqx, nqpqz, ++ q); ++ swap_conditional(nqx2, nqpqx2, bit); ++ swap_conditional(nqz2, nqpqz2, bit); ++ ++ t = nqx; ++ nqx = nqx2; ++ nqx2 = t; ++ t = nqz; ++ nqz = nqz2; ++ nqz2 = t; ++ t = nqpqx; ++ nqpqx = nqpqx2; ++ nqpqx2 = t; ++ t = nqpqz; ++ nqpqz = nqpqz2; ++ nqpqz2 = t; ++ ++ byte <<= 1; ++ } ++ } ++ ++ memcpy(resultx, nqx, sizeof(limb) * 10); ++ memcpy(resultz, nqz, sizeof(limb) * 10); ++} ++ ++bool curve25519(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]) ++{ ++#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && defined(CONFIG_ARM) ++ if (curve25519_use_neon && may_use_simd()) { ++ kernel_neon_begin(); ++ curve25519_asm_neon(mypublic, secret, basepoint); ++ kernel_neon_end(); ++ } else ++#endif ++ { ++ struct other_stack *s = kzalloc(sizeof(struct 
other_stack), GFP_KERNEL); ++ if (unlikely(!s)) ++ return false; ++ ++ memcpy(s->ee, secret, 32); ++ normalize_secret(s->ee); ++ ++ fexpand(s->bp, basepoint); ++ cmult(s, s->x, s->z, s->ee, s->bp); ++ crecip(s->zmone, s->z); ++ fmul(s->z, s->x, s->zmone); ++ fcontract(mypublic, s->z); ++ ++ kzfree(s); ++ } ++ return crypto_memneq(mypublic, null_point, CURVE25519_POINT_SIZE); ++} ++#endif ++bool curve25519_generate_public(u8 pub[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE]) ++{ ++ static const u8 basepoint[CURVE25519_POINT_SIZE] __aligned(32) = { 9 }; ++ return curve25519(pub, secret, basepoint); +} +#endif + +void curve25519_generate_secret(u8 secret[CURVE25519_POINT_SIZE]) +{ -+ get_random_bytes(secret, CURVE25519_POINT_SIZE); ++ get_random_bytes_wait(secret, CURVE25519_POINT_SIZE); + normalize_secret(secret); +} + -+void curve25519_generate_public(u8 pub[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE]) -+{ -+ static const u8 basepoint[CURVE25519_POINT_SIZE] = { 9 }; -+ curve25519(pub, secret, basepoint); -+} -+ +#include "../selftest/curve25519.h" ---- /dev/null -+++ b/net/wireguard/crypto/blake2s.h -@@ -0,0 +1,36 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/crypto/blake2s.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,38 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ + +#ifndef BLAKE2S_H @@ -8389,14 +9174,16 @@ + +void blake2s_hmac(u8 *out, const u8 *in, const u8 *key, const u8 outlen, const u64 inlen, const u64 keylen); + ++void blake2s_fpu_init(void); ++ +#ifdef DEBUG +bool blake2s_selftest(void); +#endif + +#endif ---- /dev/null -+++ b/net/wireguard/crypto/chacha20poly1305.h -@@ -0,0 +1,78 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/crypto/chacha20poly1305.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,88 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. 
*/ + +#ifndef CHACHA20POLY1305_H @@ -8412,61 +9199,71 @@ + CHACHA20POLY1305_AUTHTAGLEN = 16 +}; + -+void chacha20poly1305_init(void); ++void chacha20poly1305_fpu_init(void); + -+bool chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, ++void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, + const u8 *ad, const size_t ad_len, + const u64 nonce, const u8 key[CHACHA20POLY1305_KEYLEN]); + -+bool chacha20poly1305_encrypt_sg(struct scatterlist *dst, struct scatterlist *src, const size_t src_len, ++bool __must_check chacha20poly1305_encrypt_sg(struct scatterlist *dst, struct scatterlist *src, const size_t src_len, + const u8 *ad, const size_t ad_len, + const u64 nonce, const u8 key[CHACHA20POLY1305_KEYLEN], + bool have_simd); + -+bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, ++bool __must_check chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, + const u8 *ad, const size_t ad_len, + const u64 nonce, const u8 key[CHACHA20POLY1305_KEYLEN]); + -+bool chacha20poly1305_decrypt_sg(struct scatterlist *dst, struct scatterlist *src, const size_t src_len, ++bool __must_check chacha20poly1305_decrypt_sg(struct scatterlist *dst, struct scatterlist *src, const size_t src_len, + const u8 *ad, const size_t ad_len, + const u64 nonce, const u8 key[CHACHA20POLY1305_KEYLEN]); + -+bool xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, ++void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, + const u8 *ad, const size_t ad_len, + const u8 nonce[XCHACHA20POLY1305_NONCELEN], + const u8 key[CHACHA20POLY1305_KEYLEN]); + -+bool xchacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, ++bool __must_check xchacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, + const u8 *ad, const size_t ad_len, + const u8 nonce[XCHACHA20POLY1305_NONCELEN], + const u8 key[CHACHA20POLY1305_KEYLEN]); + -+#ifdef CONFIG_X86_64 ++#if defined(CONFIG_X86_64) +#include -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) +#include +#include -+#else -+#include -+#endif ++#elif IS_ENABLED(CONFIG_KERNEL_MODE_NEON) ++#include ++#include +#endif + +static inline bool chacha20poly1305_init_simd(void) +{ + bool have_simd = false; -+#ifdef CONFIG_X86_64 ++#if defined(CONFIG_X86_64) + have_simd = irq_fpu_usable(); + if (have_simd) + kernel_fpu_begin(); ++#elif IS_ENABLED(CONFIG_KERNEL_MODE_NEON) ++#if defined(CONFIG_ARM64) ++ have_simd = true; /* ARM64 supports NEON in any context. */ ++#elif defined(CONFIG_ARM) ++ have_simd = may_use_simd(); /* ARM doesn't support NEON in interrupt context. */ ++#endif ++ if (have_simd) ++ kernel_neon_begin(); +#endif + return have_simd; +} + +static inline void chacha20poly1305_deinit_simd(bool was_on) +{ -+#ifdef CONFIG_X86_64 ++#if defined(CONFIG_X86_64) + if (was_on) + kernel_fpu_end(); ++#elif IS_ENABLED(CONFIG_KERNEL_MODE_NEON) ++ if (was_on) ++ kernel_neon_end(); +#endif +} + @@ -8475,9 +9272,9 @@ +#endif + +#endif ---- /dev/null -+++ b/net/wireguard/crypto/curve25519.h -@@ -0,0 +1,20 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/crypto/curve25519.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,22 @@ +/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. 
*/ + +#ifndef CURVE25519_H @@ -8489,18 +9286,599 @@ + CURVE25519_POINT_SIZE = 32 +}; + -+void curve25519(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]); ++bool __must_check curve25519(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]); +void curve25519_generate_secret(u8 secret[CURVE25519_POINT_SIZE]); -+void curve25519_generate_public(u8 pub[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE]); ++bool __must_check curve25519_generate_public(u8 pub[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE]); ++ ++void curve25519_fpu_init(void); + +#ifdef DEBUG +bool curve25519_selftest(void); +#endif + +#endif ---- /dev/null -+++ b/net/wireguard/crypto/chacha20-avx2-x86_64.S -@@ -0,0 +1,443 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/crypto/blake2s-avx-x86_64.S 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,576 @@ ++/* ++ * Copyright (C) 2017 Jason A. Donenfeld . All Rights Reserved. ++ * Based on algorithms from Samuel Neves ++ */ ++ ++#include ++ ++.section .rodata.cst32.BLAKECONST, "aM", @progbits, 32 ++.align 32 ++IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 ++ .octa 0x5BE0CD191F83D9AB9B05688C510E527F ++.section .rodata.cst16.ROT16, "aM", @progbits, 16 ++.align 16 ++ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 ++.section .rodata.cst16.ROR328, "aM", @progbits, 16 ++.align 16 ++ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 ++ ++.text ++ENTRY(blake2s_compress_avx) ++ vmovdqu IV+16(%rip), %xmm1 ++ vmovdqu (%rsi), %xmm4 ++ vpxor 32(%rdi), %xmm1, %xmm1 ++ vmovdqu 16(%rsi), %xmm3 ++ vshufps $136, %xmm3, %xmm4, %xmm6 ++ vmovdqa ROT16(%rip), %xmm7 ++ vpaddd (%rdi), %xmm6, %xmm6 ++ vpaddd 16(%rdi), %xmm6, %xmm6 ++ vpxor %xmm6, %xmm1, %xmm1 ++ vmovdqu IV(%rip), %xmm8 ++ vpshufb %xmm7, %xmm1, %xmm1 ++ vmovdqu 48(%rsi), %xmm5 ++ vpaddd %xmm1, %xmm8, %xmm8 ++ vpxor 16(%rdi), %xmm8, %xmm9 ++ vmovdqu 32(%rsi), %xmm2 ++ vpblendw $12, %xmm3, %xmm5, %xmm13 ++ vshufps $221, %xmm5, %xmm2, %xmm12 ++ vpunpckhqdq %xmm2, %xmm4, %xmm14 ++ vpslld $20, %xmm9, %xmm0 ++ vpsrld $12, %xmm9, %xmm9 ++ vpxor %xmm0, %xmm9, %xmm0 ++ vshufps $221, %xmm3, %xmm4, %xmm9 ++ vpaddd %xmm9, %xmm6, %xmm9 ++ vpaddd %xmm0, %xmm9, %xmm9 ++ vpxor %xmm9, %xmm1, %xmm1 ++ vmovdqa ROR328(%rip), %xmm6 ++ vpshufb %xmm6, %xmm1, %xmm1 ++ vpaddd %xmm1, %xmm8, %xmm8 ++ vpxor %xmm8, %xmm0, %xmm0 ++ vpshufd $147, %xmm1, %xmm1 ++ vpshufd $78, %xmm8, %xmm8 ++ vpslld $25, %xmm0, %xmm10 ++ vpsrld $7, %xmm0, %xmm0 ++ vpxor %xmm10, %xmm0, %xmm0 ++ vshufps $136, %xmm5, %xmm2, %xmm10 ++ vpshufd $57, %xmm0, %xmm0 ++ vpaddd %xmm10, %xmm9, %xmm9 ++ vpaddd %xmm0, %xmm9, %xmm9 ++ vpxor %xmm9, %xmm1, %xmm1 ++ vpaddd %xmm12, %xmm9, %xmm9 ++ vpblendw $12, %xmm2, %xmm3, %xmm12 ++ vpshufb %xmm7, %xmm1, %xmm1 ++ vpaddd %xmm1, %xmm8, %xmm8 ++ vpxor %xmm8, %xmm0, %xmm10 ++ vpslld $20, %xmm10, %xmm0 ++ vpsrld $12, %xmm10, %xmm10 ++ vpxor %xmm0, %xmm10, %xmm0 ++ vpaddd %xmm0, %xmm9, %xmm9 ++ vpxor %xmm9, %xmm1, %xmm1 ++ vpshufb %xmm6, %xmm1, %xmm1 ++ vpaddd %xmm1, %xmm8, %xmm8 ++ vpxor %xmm8, %xmm0, %xmm0 ++ vpshufd $57, %xmm1, %xmm1 ++ vpshufd $78, %xmm8, %xmm8 ++ vpslld $25, %xmm0, %xmm10 ++ vpsrld $7, %xmm0, %xmm0 ++ vpxor %xmm10, %xmm0, %xmm0 ++ vpslldq $4, %xmm5, %xmm10 ++ vpblendw $240, %xmm10, %xmm12, %xmm12 ++ vpshufd $147, %xmm0, %xmm0 ++ vpshufd $147, %xmm12, %xmm12 ++ vpaddd %xmm9, %xmm12, %xmm12 ++ vpaddd %xmm0, %xmm12, %xmm12 ++ vpxor %xmm12, %xmm1, %xmm1 ++ vpshufb 
%xmm7, %xmm1, %xmm1 ++ vpaddd %xmm1, %xmm8, %xmm8 ++ vpxor %xmm8, %xmm0, %xmm11 ++ vpslld $20, %xmm11, %xmm9 ++ vpsrld $12, %xmm11, %xmm11 ++ vpxor %xmm9, %xmm11, %xmm0 ++ vpshufd $8, %xmm2, %xmm9 ++ vpblendw $192, %xmm5, %xmm3, %xmm11 ++ vpblendw $240, %xmm11, %xmm9, %xmm9 ++ vpshufd $177, %xmm9, %xmm9 ++ vpaddd %xmm12, %xmm9, %xmm9 ++ vpaddd %xmm0, %xmm9, %xmm11 ++ vpxor %xmm11, %xmm1, %xmm1 ++ vpshufb %xmm6, %xmm1, %xmm1 ++ vpaddd %xmm1, %xmm8, %xmm8 ++ vpxor %xmm8, %xmm0, %xmm9 ++ vpshufd $147, %xmm1, %xmm1 ++ vpshufd $78, %xmm8, %xmm8 ++ vpslld $25, %xmm9, %xmm0 ++ vpsrld $7, %xmm9, %xmm9 ++ vpxor %xmm0, %xmm9, %xmm0 ++ vpslldq $4, %xmm3, %xmm9 ++ vpblendw $48, %xmm9, %xmm2, %xmm9 ++ vpblendw $240, %xmm9, %xmm4, %xmm9 ++ vpshufd $57, %xmm0, %xmm0 ++ vpshufd $177, %xmm9, %xmm9 ++ vpaddd %xmm11, %xmm9, %xmm9 ++ vpaddd %xmm0, %xmm9, %xmm9 ++ vpxor %xmm9, %xmm1, %xmm1 ++ vpshufb %xmm7, %xmm1, %xmm1 ++ vpaddd %xmm1, %xmm8, %xmm11 ++ vpxor %xmm11, %xmm0, %xmm0 ++ vpslld $20, %xmm0, %xmm8 ++ vpsrld $12, %xmm0, %xmm0 ++ vpxor %xmm8, %xmm0, %xmm0 ++ vpunpckhdq %xmm3, %xmm4, %xmm8 ++ vpblendw $12, %xmm10, %xmm8, %xmm12 ++ vpshufd $177, %xmm12, %xmm12 ++ vpaddd %xmm9, %xmm12, %xmm9 ++ vpaddd %xmm0, %xmm9, %xmm9 ++ vpxor %xmm9, %xmm1, %xmm1 ++ vpshufb %xmm6, %xmm1, %xmm1 ++ vpaddd %xmm1, %xmm11, %xmm11 ++ vpxor %xmm11, %xmm0, %xmm0 ++ vpshufd $57, %xmm1, %xmm1 ++ vpshufd $78, %xmm11, %xmm11 ++ vpslld $25, %xmm0, %xmm12 ++ vpsrld $7, %xmm0, %xmm0 ++ vpxor %xmm12, %xmm0, %xmm0 ++ vpunpckhdq %xmm5, %xmm2, %xmm12 ++ vpshufd $147, %xmm0, %xmm0 ++ vpblendw $15, %xmm13, %xmm12, %xmm12 ++ vpslldq $8, %xmm5, %xmm13 ++ vpshufd $210, %xmm12, %xmm12 ++ vpaddd %xmm9, %xmm12, %xmm9 ++ vpaddd %xmm0, %xmm9, %xmm9 ++ vpxor %xmm9, %xmm1, %xmm1 ++ vpshufb %xmm7, %xmm1, %xmm1 ++ vpaddd %xmm1, %xmm11, %xmm11 ++ vpxor %xmm11, %xmm0, %xmm0 ++ vpslld $20, %xmm0, %xmm12 ++ vpsrld $12, %xmm0, %xmm0 ++ vpxor %xmm12, %xmm0, %xmm0 ++ vpunpckldq %xmm4, %xmm2, %xmm12 ++ vpblendw $240, %xmm4, %xmm12, %xmm12 ++ vpblendw $192, %xmm13, %xmm12, %xmm12 ++ vpsrldq $12, %xmm3, %xmm13 ++ vpaddd %xmm12, %xmm9, %xmm9 ++ vpaddd %xmm0, %xmm9, %xmm9 ++ vpxor %xmm9, %xmm1, %xmm1 ++ vpshufb %xmm6, %xmm1, %xmm1 ++ vpaddd %xmm1, %xmm11, %xmm11 ++ vpxor %xmm11, %xmm0, %xmm0 ++ vpshufd $147, %xmm1, %xmm1 ++ vpshufd $78, %xmm11, %xmm11 ++ vpslld $25, %xmm0, %xmm12 ++ vpsrld $7, %xmm0, %xmm0 ++ vpxor %xmm12, %xmm0, %xmm0 ++ vpblendw $60, %xmm2, %xmm4, %xmm12 ++ vpblendw $3, %xmm13, %xmm12, %xmm12 ++ vpshufd $57, %xmm0, %xmm0 ++ vpshufd $78, %xmm12, %xmm12 ++ vpaddd %xmm9, %xmm12, %xmm9 ++ vpaddd %xmm0, %xmm9, %xmm9 ++ vpxor %xmm9, %xmm1, %xmm1 ++ vpshufb %xmm7, %xmm1, %xmm1 ++ vpaddd %xmm1, %xmm11, %xmm11 ++ vpxor %xmm11, %xmm0, %xmm12 ++ vpslld $20, %xmm12, %xmm13 ++ vpsrld $12, %xmm12, %xmm0 ++ vpblendw $51, %xmm3, %xmm4, %xmm12 ++ vpxor %xmm13, %xmm0, %xmm0 ++ vpblendw $192, %xmm10, %xmm12, %xmm10 ++ vpslldq $8, %xmm2, %xmm12 ++ vpshufd $27, %xmm10, %xmm10 ++ vpaddd %xmm9, %xmm10, %xmm9 ++ vpaddd %xmm0, %xmm9, %xmm9 ++ vpxor %xmm9, %xmm1, %xmm1 ++ vpshufb %xmm6, %xmm1, %xmm1 ++ vpaddd %xmm1, %xmm11, %xmm11 ++ vpxor %xmm11, %xmm0, %xmm0 ++ vpshufd $57, %xmm1, %xmm1 ++ vpshufd $78, %xmm11, %xmm11 ++ vpslld $25, %xmm0, %xmm10 ++ vpsrld $7, %xmm0, %xmm0 ++ vpxor %xmm10, %xmm0, %xmm0 ++ vpunpckhdq %xmm2, %xmm8, %xmm10 ++ vpshufd $147, %xmm0, %xmm0 ++ vpblendw $12, %xmm5, %xmm10, %xmm10 ++ vpshufd $210, %xmm10, %xmm10 ++ vpaddd %xmm9, %xmm10, %xmm9 ++ vpaddd %xmm0, %xmm9, %xmm9 ++ vpxor %xmm9, %xmm1, %xmm1 ++ vpshufb %xmm7, %xmm1, %xmm1 ++ vpaddd 
%xmm1, %xmm11, %xmm11 ++ vpxor %xmm11, %xmm0, %xmm10 ++ vpslld $20, %xmm10, %xmm0 ++ vpsrld $12, %xmm10, %xmm10 ++ vpxor %xmm0, %xmm10, %xmm0 ++ vpblendw $12, %xmm4, %xmm5, %xmm10 ++ vpblendw $192, %xmm12, %xmm10, %xmm10 ++ vpunpckldq %xmm2, %xmm4, %xmm12 ++ vpshufd $135, %xmm10, %xmm10 ++ vpaddd %xmm9, %xmm10, %xmm9 ++ vpaddd %xmm0, %xmm9, %xmm9 ++ vpxor %xmm9, %xmm1, %xmm1 ++ vpshufb %xmm6, %xmm1, %xmm1 ++ vpaddd %xmm1, %xmm11, %xmm13 ++ vpxor %xmm13, %xmm0, %xmm0 ++ vpshufd $147, %xmm1, %xmm1 ++ vpshufd $78, %xmm13, %xmm13 ++ vpslld $25, %xmm0, %xmm10 ++ vpsrld $7, %xmm0, %xmm0 ++ vpxor %xmm10, %xmm0, %xmm0 ++ vpblendw $15, %xmm3, %xmm4, %xmm10 ++ vpblendw $192, %xmm5, %xmm10, %xmm10 ++ vpshufd $57, %xmm0, %xmm0 ++ vpshufd $198, %xmm10, %xmm10 ++ vpaddd %xmm9, %xmm10, %xmm10 ++ vpaddd %xmm0, %xmm10, %xmm10 ++ vpxor %xmm10, %xmm1, %xmm1 ++ vpshufb %xmm7, %xmm1, %xmm1 ++ vpaddd %xmm1, %xmm13, %xmm13 ++ vpxor %xmm13, %xmm0, %xmm9 ++ vpslld $20, %xmm9, %xmm0 ++ vpsrld $12, %xmm9, %xmm9 ++ vpxor %xmm0, %xmm9, %xmm0 ++ vpunpckhdq %xmm2, %xmm3, %xmm9 ++ vpunpcklqdq %xmm12, %xmm9, %xmm15 ++ vpunpcklqdq %xmm12, %xmm8, %xmm12 ++ vpblendw $15, %xmm5, %xmm8, %xmm8 ++ vpaddd %xmm15, %xmm10, %xmm15 ++ vpaddd %xmm0, %xmm15, %xmm15 ++ vpxor %xmm15, %xmm1, %xmm1 ++ vpshufd $141, %xmm8, %xmm8 ++ vpshufb %xmm6, %xmm1, %xmm1 ++ vpaddd %xmm1, %xmm13, %xmm13 ++ vpxor %xmm13, %xmm0, %xmm0 ++ vpshufd $57, %xmm1, %xmm1 ++ vpshufd $78, %xmm13, %xmm13 ++ vpslld $25, %xmm0, %xmm10 ++ vpsrld $7, %xmm0, %xmm0 ++ vpxor %xmm10, %xmm0, %xmm0 ++ vpunpcklqdq %xmm2, %xmm3, %xmm10 ++ vpshufd $147, %xmm0, %xmm0 ++ vpblendw $51, %xmm14, %xmm10, %xmm14 ++ vpshufd $135, %xmm14, %xmm14 ++ vpaddd %xmm15, %xmm14, %xmm14 ++ vpaddd %xmm0, %xmm14, %xmm14 ++ vpxor %xmm14, %xmm1, %xmm1 ++ vpunpcklqdq %xmm3, %xmm4, %xmm15 ++ vpshufb %xmm7, %xmm1, %xmm1 ++ vpaddd %xmm1, %xmm13, %xmm13 ++ vpxor %xmm13, %xmm0, %xmm0 ++ vpslld $20, %xmm0, %xmm11 ++ vpsrld $12, %xmm0, %xmm0 ++ vpxor %xmm11, %xmm0, %xmm0 ++ vpunpckhqdq %xmm5, %xmm3, %xmm11 ++ vpblendw $51, %xmm15, %xmm11, %xmm11 ++ vpunpckhqdq %xmm3, %xmm5, %xmm15 ++ vpaddd %xmm11, %xmm14, %xmm11 ++ vpaddd %xmm0, %xmm11, %xmm11 ++ vpxor %xmm11, %xmm1, %xmm1 ++ vpshufb %xmm6, %xmm1, %xmm1 ++ vpaddd %xmm1, %xmm13, %xmm13 ++ vpxor %xmm13, %xmm0, %xmm0 ++ vpshufd $147, %xmm1, %xmm1 ++ vpshufd $78, %xmm13, %xmm13 ++ vpslld $25, %xmm0, %xmm14 ++ vpsrld $7, %xmm0, %xmm0 ++ vpxor %xmm14, %xmm0, %xmm14 ++ vpunpckhqdq %xmm4, %xmm2, %xmm0 ++ vpshufd $57, %xmm14, %xmm14 ++ vpblendw $51, %xmm15, %xmm0, %xmm15 ++ vpaddd %xmm15, %xmm11, %xmm15 ++ vpaddd %xmm14, %xmm15, %xmm15 ++ vpxor %xmm15, %xmm1, %xmm1 ++ vpshufb %xmm7, %xmm1, %xmm1 ++ vpaddd %xmm1, %xmm13, %xmm13 ++ vpxor %xmm13, %xmm14, %xmm14 ++ vpslld $20, %xmm14, %xmm11 ++ vpsrld $12, %xmm14, %xmm14 ++ vpxor %xmm11, %xmm14, %xmm14 ++ vpblendw $3, %xmm2, %xmm4, %xmm11 ++ vpslldq $8, %xmm11, %xmm0 ++ vpblendw $15, %xmm5, %xmm0, %xmm0 ++ vpshufd $99, %xmm0, %xmm0 ++ vpaddd %xmm15, %xmm0, %xmm15 ++ vpaddd %xmm14, %xmm15, %xmm15 ++ vpxor %xmm15, %xmm1, %xmm0 ++ vpaddd %xmm12, %xmm15, %xmm15 ++ vpshufb %xmm6, %xmm0, %xmm0 ++ vpaddd %xmm0, %xmm13, %xmm13 ++ vpxor %xmm13, %xmm14, %xmm14 ++ vpshufd $57, %xmm0, %xmm0 ++ vpshufd $78, %xmm13, %xmm13 ++ vpslld $25, %xmm14, %xmm1 ++ vpsrld $7, %xmm14, %xmm14 ++ vpxor %xmm1, %xmm14, %xmm14 ++ vpblendw $3, %xmm5, %xmm4, %xmm1 ++ vpshufd $147, %xmm14, %xmm14 ++ vpaddd %xmm14, %xmm15, %xmm15 ++ vpxor %xmm15, %xmm0, %xmm0 ++ vpshufb %xmm7, %xmm0, %xmm0 ++ vpaddd %xmm0, %xmm13, %xmm13 ++ vpxor %xmm13, %xmm14, %xmm14 
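++	/* The BLAKE2s G-function rotations: >>>16 and >>>8 are done with
++	 * vpshufb byte permutations (the ROT16 and ROR328 masks), while
++	 * >>>12 and >>>7 need a vpslld/vpsrld pair merged by vpxor, as in
++	 * the three instructions that follow. */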
++ vpslld $20, %xmm14, %xmm12 ++ vpsrld $12, %xmm14, %xmm14 ++ vpxor %xmm12, %xmm14, %xmm14 ++ vpsrldq $4, %xmm2, %xmm12 ++ vpblendw $60, %xmm12, %xmm1, %xmm1 ++ vpaddd %xmm1, %xmm15, %xmm15 ++ vpaddd %xmm14, %xmm15, %xmm15 ++ vpxor %xmm15, %xmm0, %xmm0 ++ vpblendw $12, %xmm4, %xmm3, %xmm1 ++ vpshufb %xmm6, %xmm0, %xmm0 ++ vpaddd %xmm0, %xmm13, %xmm13 ++ vpxor %xmm13, %xmm14, %xmm14 ++ vpshufd $147, %xmm0, %xmm0 ++ vpshufd $78, %xmm13, %xmm13 ++ vpslld $25, %xmm14, %xmm12 ++ vpsrld $7, %xmm14, %xmm14 ++ vpxor %xmm12, %xmm14, %xmm14 ++ vpsrldq $4, %xmm5, %xmm12 ++ vpblendw $48, %xmm12, %xmm1, %xmm1 ++ vpshufd $33, %xmm5, %xmm12 ++ vpshufd $57, %xmm14, %xmm14 ++ vpshufd $108, %xmm1, %xmm1 ++ vpblendw $51, %xmm12, %xmm10, %xmm12 ++ vpaddd %xmm15, %xmm1, %xmm15 ++ vpaddd %xmm14, %xmm15, %xmm15 ++ vpxor %xmm15, %xmm0, %xmm0 ++ vpaddd %xmm12, %xmm15, %xmm15 ++ vpshufb %xmm7, %xmm0, %xmm0 ++ vpaddd %xmm0, %xmm13, %xmm1 ++ vpxor %xmm1, %xmm14, %xmm14 ++ vpslld $20, %xmm14, %xmm13 ++ vpsrld $12, %xmm14, %xmm14 ++ vpxor %xmm13, %xmm14, %xmm14 ++ vpslldq $12, %xmm3, %xmm13 ++ vpaddd %xmm14, %xmm15, %xmm15 ++ vpxor %xmm15, %xmm0, %xmm0 ++ vpshufb %xmm6, %xmm0, %xmm0 ++ vpaddd %xmm0, %xmm1, %xmm1 ++ vpxor %xmm1, %xmm14, %xmm14 ++ vpshufd $57, %xmm0, %xmm0 ++ vpshufd $78, %xmm1, %xmm1 ++ vpslld $25, %xmm14, %xmm12 ++ vpsrld $7, %xmm14, %xmm14 ++ vpxor %xmm12, %xmm14, %xmm14 ++ vpblendw $51, %xmm5, %xmm4, %xmm12 ++ vpshufd $147, %xmm14, %xmm14 ++ vpblendw $192, %xmm13, %xmm12, %xmm12 ++ vpaddd %xmm12, %xmm15, %xmm15 ++ vpaddd %xmm14, %xmm15, %xmm15 ++ vpxor %xmm15, %xmm0, %xmm0 ++ vpsrldq $4, %xmm3, %xmm12 ++ vpshufb %xmm7, %xmm0, %xmm0 ++ vpaddd %xmm0, %xmm1, %xmm1 ++ vpxor %xmm1, %xmm14, %xmm14 ++ vpslld $20, %xmm14, %xmm13 ++ vpsrld $12, %xmm14, %xmm14 ++ vpxor %xmm13, %xmm14, %xmm14 ++ vpblendw $48, %xmm2, %xmm5, %xmm13 ++ vpblendw $3, %xmm12, %xmm13, %xmm13 ++ vpshufd $156, %xmm13, %xmm13 ++ vpaddd %xmm15, %xmm13, %xmm15 ++ vpaddd %xmm14, %xmm15, %xmm15 ++ vpxor %xmm15, %xmm0, %xmm0 ++ vpshufb %xmm6, %xmm0, %xmm0 ++ vpaddd %xmm0, %xmm1, %xmm1 ++ vpxor %xmm1, %xmm14, %xmm14 ++ vpshufd $147, %xmm0, %xmm0 ++ vpshufd $78, %xmm1, %xmm1 ++ vpslld $25, %xmm14, %xmm13 ++ vpsrld $7, %xmm14, %xmm14 ++ vpxor %xmm13, %xmm14, %xmm14 ++ vpunpcklqdq %xmm2, %xmm4, %xmm13 ++ vpshufd $57, %xmm14, %xmm14 ++ vpblendw $12, %xmm12, %xmm13, %xmm12 ++ vpshufd $180, %xmm12, %xmm12 ++ vpaddd %xmm15, %xmm12, %xmm15 ++ vpaddd %xmm14, %xmm15, %xmm15 ++ vpxor %xmm15, %xmm0, %xmm0 ++ vpshufb %xmm7, %xmm0, %xmm0 ++ vpaddd %xmm0, %xmm1, %xmm1 ++ vpxor %xmm1, %xmm14, %xmm14 ++ vpslld $20, %xmm14, %xmm12 ++ vpsrld $12, %xmm14, %xmm14 ++ vpxor %xmm12, %xmm14, %xmm14 ++ vpunpckhqdq %xmm9, %xmm4, %xmm12 ++ vpshufd $198, %xmm12, %xmm12 ++ vpaddd %xmm15, %xmm12, %xmm15 ++ vpaddd %xmm14, %xmm15, %xmm15 ++ vpxor %xmm15, %xmm0, %xmm0 ++ vpaddd %xmm15, %xmm8, %xmm15 ++ vpshufb %xmm6, %xmm0, %xmm0 ++ vpaddd %xmm0, %xmm1, %xmm1 ++ vpxor %xmm1, %xmm14, %xmm14 ++ vpshufd $57, %xmm0, %xmm0 ++ vpshufd $78, %xmm1, %xmm1 ++ vpslld $25, %xmm14, %xmm12 ++ vpsrld $7, %xmm14, %xmm14 ++ vpxor %xmm12, %xmm14, %xmm14 ++ vpsrldq $4, %xmm4, %xmm12 ++ vpshufd $147, %xmm14, %xmm14 ++ vpaddd %xmm14, %xmm15, %xmm15 ++ vpxor %xmm15, %xmm0, %xmm0 ++ vpshufb %xmm7, %xmm0, %xmm0 ++ vpaddd %xmm0, %xmm1, %xmm1 ++ vpxor %xmm1, %xmm14, %xmm14 ++ vpslld $20, %xmm14, %xmm8 ++ vpsrld $12, %xmm14, %xmm14 ++ vpxor %xmm14, %xmm8, %xmm14 ++ vpblendw $48, %xmm5, %xmm2, %xmm8 ++ vpblendw $3, %xmm12, %xmm8, %xmm8 ++ vpunpckhqdq %xmm5, %xmm4, %xmm12 ++ vpshufd $75, %xmm8, %xmm8 ++ 
vpblendw $60, %xmm10, %xmm12, %xmm10 ++ vpaddd %xmm15, %xmm8, %xmm15 ++ vpaddd %xmm14, %xmm15, %xmm15 ++ vpxor %xmm0, %xmm15, %xmm0 ++ vpshufd $45, %xmm10, %xmm10 ++ vpshufb %xmm6, %xmm0, %xmm0 ++ vpaddd %xmm15, %xmm10, %xmm15 ++ vpaddd %xmm0, %xmm1, %xmm1 ++ vpxor %xmm1, %xmm14, %xmm14 ++ vpshufd $147, %xmm0, %xmm0 ++ vpshufd $78, %xmm1, %xmm1 ++ vpslld $25, %xmm14, %xmm8 ++ vpsrld $7, %xmm14, %xmm14 ++ vpxor %xmm14, %xmm8, %xmm8 ++ vpshufd $57, %xmm8, %xmm8 ++ vpaddd %xmm8, %xmm15, %xmm15 ++ vpxor %xmm0, %xmm15, %xmm0 ++ vpshufb %xmm7, %xmm0, %xmm0 ++ vpaddd %xmm0, %xmm1, %xmm1 ++ vpxor %xmm8, %xmm1, %xmm8 ++ vpslld $20, %xmm8, %xmm10 ++ vpsrld $12, %xmm8, %xmm8 ++ vpxor %xmm8, %xmm10, %xmm10 ++ vpunpckldq %xmm3, %xmm4, %xmm8 ++ vpunpcklqdq %xmm9, %xmm8, %xmm9 ++ vpaddd %xmm9, %xmm15, %xmm9 ++ vpaddd %xmm10, %xmm9, %xmm9 ++ vpxor %xmm0, %xmm9, %xmm8 ++ vpshufb %xmm6, %xmm8, %xmm8 ++ vpaddd %xmm8, %xmm1, %xmm1 ++ vpxor %xmm1, %xmm10, %xmm10 ++ vpshufd $57, %xmm8, %xmm8 ++ vpshufd $78, %xmm1, %xmm1 ++ vpslld $25, %xmm10, %xmm12 ++ vpsrld $7, %xmm10, %xmm10 ++ vpxor %xmm10, %xmm12, %xmm10 ++ vpblendw $48, %xmm4, %xmm3, %xmm12 ++ vpshufd $147, %xmm10, %xmm0 ++ vpunpckhdq %xmm5, %xmm3, %xmm10 ++ vpshufd $78, %xmm12, %xmm12 ++ vpunpcklqdq %xmm4, %xmm10, %xmm10 ++ vpblendw $192, %xmm2, %xmm10, %xmm10 ++ vpshufhw $78, %xmm10, %xmm10 ++ vpaddd %xmm10, %xmm9, %xmm10 ++ vpaddd %xmm0, %xmm10, %xmm10 ++ vpxor %xmm8, %xmm10, %xmm8 ++ vpshufb %xmm7, %xmm8, %xmm8 ++ vpaddd %xmm8, %xmm1, %xmm1 ++ vpxor %xmm0, %xmm1, %xmm9 ++ vpslld $20, %xmm9, %xmm0 ++ vpsrld $12, %xmm9, %xmm9 ++ vpxor %xmm9, %xmm0, %xmm0 ++ vpunpckhdq %xmm5, %xmm4, %xmm9 ++ vpblendw $240, %xmm9, %xmm2, %xmm13 ++ vpshufd $39, %xmm13, %xmm13 ++ vpaddd %xmm10, %xmm13, %xmm10 ++ vpaddd %xmm0, %xmm10, %xmm10 ++ vpxor %xmm8, %xmm10, %xmm8 ++ vpblendw $12, %xmm4, %xmm2, %xmm13 ++ vpshufb %xmm6, %xmm8, %xmm8 ++ vpslldq $4, %xmm13, %xmm13 ++ vpblendw $15, %xmm5, %xmm13, %xmm13 ++ vpaddd %xmm8, %xmm1, %xmm1 ++ vpxor %xmm1, %xmm0, %xmm0 ++ vpaddd %xmm13, %xmm10, %xmm13 ++ vpshufd $147, %xmm8, %xmm8 ++ vpshufd $78, %xmm1, %xmm1 ++ vpslld $25, %xmm0, %xmm14 ++ vpsrld $7, %xmm0, %xmm0 ++ vpxor %xmm0, %xmm14, %xmm14 ++ vpshufd $57, %xmm14, %xmm14 ++ vpaddd %xmm14, %xmm13, %xmm13 ++ vpxor %xmm8, %xmm13, %xmm8 ++ vpaddd %xmm13, %xmm12, %xmm12 ++ vpshufb %xmm7, %xmm8, %xmm8 ++ vpaddd %xmm8, %xmm1, %xmm1 ++ vpxor %xmm14, %xmm1, %xmm14 ++ vpslld $20, %xmm14, %xmm10 ++ vpsrld $12, %xmm14, %xmm14 ++ vpxor %xmm14, %xmm10, %xmm10 ++ vpaddd %xmm10, %xmm12, %xmm12 ++ vpxor %xmm8, %xmm12, %xmm8 ++ vpshufb %xmm6, %xmm8, %xmm8 ++ vpaddd %xmm8, %xmm1, %xmm1 ++ vpxor %xmm1, %xmm10, %xmm0 ++ vpshufd $57, %xmm8, %xmm8 ++ vpshufd $78, %xmm1, %xmm1 ++ vpslld $25, %xmm0, %xmm10 ++ vpsrld $7, %xmm0, %xmm0 ++ vpxor %xmm0, %xmm10, %xmm10 ++ vpblendw $48, %xmm2, %xmm3, %xmm0 ++ vpblendw $15, %xmm11, %xmm0, %xmm0 ++ vpshufd $147, %xmm10, %xmm10 ++ vpshufd $114, %xmm0, %xmm0 ++ vpaddd %xmm12, %xmm0, %xmm0 ++ vpaddd %xmm10, %xmm0, %xmm0 ++ vpxor %xmm8, %xmm0, %xmm8 ++ vpshufb %xmm7, %xmm8, %xmm8 ++ vpaddd %xmm8, %xmm1, %xmm1 ++ vpxor %xmm10, %xmm1, %xmm10 ++ vpslld $20, %xmm10, %xmm11 ++ vpsrld $12, %xmm10, %xmm10 ++ vpxor %xmm10, %xmm11, %xmm10 ++ vpslldq $4, %xmm4, %xmm11 ++ vpblendw $192, %xmm11, %xmm3, %xmm3 ++ vpunpckldq %xmm5, %xmm4, %xmm4 ++ vpshufd $99, %xmm3, %xmm3 ++ vpaddd %xmm0, %xmm3, %xmm3 ++ vpaddd %xmm10, %xmm3, %xmm3 ++ vpxor %xmm8, %xmm3, %xmm11 ++ vpunpckldq %xmm5, %xmm2, %xmm0 ++ vpblendw $192, %xmm2, %xmm5, %xmm2 ++ vpshufb %xmm6, %xmm11, %xmm11 ++ 
vpunpckhqdq %xmm0, %xmm9, %xmm0 ++ vpblendw $15, %xmm4, %xmm2, %xmm4 ++ vpaddd %xmm11, %xmm1, %xmm1 ++ vpxor %xmm1, %xmm10, %xmm10 ++ vpshufd $147, %xmm11, %xmm11 ++ vpshufd $201, %xmm0, %xmm0 ++ vpslld $25, %xmm10, %xmm8 ++ vpsrld $7, %xmm10, %xmm10 ++ vpxor %xmm10, %xmm8, %xmm10 ++ vpshufd $78, %xmm1, %xmm1 ++ vpaddd %xmm3, %xmm0, %xmm0 ++ vpshufd $27, %xmm4, %xmm4 ++ vpshufd $57, %xmm10, %xmm10 ++ vpaddd %xmm10, %xmm0, %xmm0 ++ vpxor %xmm11, %xmm0, %xmm11 ++ vpaddd %xmm0, %xmm4, %xmm0 ++ vpshufb %xmm7, %xmm11, %xmm7 ++ vpaddd %xmm7, %xmm1, %xmm1 ++ vpxor %xmm10, %xmm1, %xmm10 ++ vpslld $20, %xmm10, %xmm8 ++ vpsrld $12, %xmm10, %xmm10 ++ vpxor %xmm10, %xmm8, %xmm8 ++ vpaddd %xmm8, %xmm0, %xmm0 ++ vpxor %xmm7, %xmm0, %xmm7 ++ vpshufb %xmm6, %xmm7, %xmm6 ++ vpaddd %xmm6, %xmm1, %xmm1 ++ vpxor %xmm1, %xmm8, %xmm8 ++ vpshufd $78, %xmm1, %xmm1 ++ vpshufd $57, %xmm6, %xmm6 ++ vpslld $25, %xmm8, %xmm2 ++ vpsrld $7, %xmm8, %xmm8 ++ vpxor %xmm8, %xmm2, %xmm8 ++ vpxor (%rdi), %xmm1, %xmm1 ++ vpshufd $147, %xmm8, %xmm8 ++ vpxor %xmm0, %xmm1, %xmm0 ++ vmovups %xmm0, (%rdi) ++ vpxor 16(%rdi), %xmm8, %xmm0 ++ vpxor %xmm6, %xmm0, %xmm6 ++ vmovups %xmm6, 16(%rdi) ++ ret ++ENDPROC(blake2s_compress_avx) +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/crypto/chacha20-avx2-x86_64.S 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,446 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions + * @@ -8514,13 +9892,16 @@ + +#include + -+.data ++.section .rodata.cst32.ROT8, "aM", @progbits, 32 +.align 32 -+ +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 + .octa 0x0e0d0c0f0a09080b0605040702010003 ++.section .rodata.cst32.ROT16, "aM", @progbits, 32 ++.align 32 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 + .octa 0x0d0c0f0e09080b0a0504070601000302 ++.section .rodata.cst32.CTRINC, "aM", @progbits, 32 ++.align 32 +CTRINC: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + @@ -8944,10 +10325,19 @@ + mov %r8,%rsp + ret +ENDPROC(chacha20_asm_8block_xor_avx2) ---- /dev/null -+++ b/net/wireguard/crypto/chacha20-ssse3-x86_64.S -@@ -0,0 +1,627 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/crypto/chacha20-neon-arm64.S 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,450 @@ +/* ++ * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions ++ * ++ * Copyright (C) 2016 Linaro, Ltd. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi @@ -8960,12 +10350,989 @@ + +#include + -+.data -+.align 16 ++ .text ++ .align 6 + -+ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 ++ENTRY(chacha20_asm_block_xor_neon) ++ // x0: Input state matrix, s ++ // x1: 1 data block output, o ++ // x2: 1 data block input, i ++ ++ // ++ // This function encrypts one ChaCha20 block by loading the state matrix ++ // in four NEON registers. It performs matrix operation on four words in ++ // parallel, but requires shuffling to rearrange the words after each ++ // round. 
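++	//
++	// The four rotation amounts are 16, 12, 8 and 7 bits. NEON has no
++	// rotate instruction, so 16 is done as a rev32 on halfwords, 8 as
++	// a tbl byte permutation through the ROT8 mask, and 12 and 7 as
++	// shl+sri pairs.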
++ // ++ ++ // x0..3 = s0..3 ++ adr x3, ROT8 ++ ld1 {v0.4s-v3.4s}, [x0] ++ ld1 {v8.4s-v11.4s}, [x0] ++ ld1 {v12.4s}, [x3] ++ ++ mov x3, #10 ++ ++.Ldoubleround: ++ // x0 += x1, x3 = rotl32(x3 ^ x0, 16) ++ add v0.4s, v0.4s, v1.4s ++ eor v3.16b, v3.16b, v0.16b ++ rev32 v3.8h, v3.8h ++ ++ // x2 += x3, x1 = rotl32(x1 ^ x2, 12) ++ add v2.4s, v2.4s, v3.4s ++ eor v4.16b, v1.16b, v2.16b ++ shl v1.4s, v4.4s, #12 ++ sri v1.4s, v4.4s, #20 ++ ++ // x0 += x1, x3 = rotl32(x3 ^ x0, 8) ++ add v0.4s, v0.4s, v1.4s ++ eor v3.16b, v3.16b, v0.16b ++ tbl v3.16b, {v3.16b}, v12.16b ++ ++ // x2 += x3, x1 = rotl32(x1 ^ x2, 7) ++ add v2.4s, v2.4s, v3.4s ++ eor v4.16b, v1.16b, v2.16b ++ shl v1.4s, v4.4s, #7 ++ sri v1.4s, v4.4s, #25 ++ ++ // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) ++ ext v1.16b, v1.16b, v1.16b, #4 ++ // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) ++ ext v2.16b, v2.16b, v2.16b, #8 ++ // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) ++ ext v3.16b, v3.16b, v3.16b, #12 ++ ++ // x0 += x1, x3 = rotl32(x3 ^ x0, 16) ++ add v0.4s, v0.4s, v1.4s ++ eor v3.16b, v3.16b, v0.16b ++ rev32 v3.8h, v3.8h ++ ++ // x2 += x3, x1 = rotl32(x1 ^ x2, 12) ++ add v2.4s, v2.4s, v3.4s ++ eor v4.16b, v1.16b, v2.16b ++ shl v1.4s, v4.4s, #12 ++ sri v1.4s, v4.4s, #20 ++ ++ // x0 += x1, x3 = rotl32(x3 ^ x0, 8) ++ add v0.4s, v0.4s, v1.4s ++ eor v3.16b, v3.16b, v0.16b ++ tbl v3.16b, {v3.16b}, v12.16b ++ ++ // x2 += x3, x1 = rotl32(x1 ^ x2, 7) ++ add v2.4s, v2.4s, v3.4s ++ eor v4.16b, v1.16b, v2.16b ++ shl v1.4s, v4.4s, #7 ++ sri v1.4s, v4.4s, #25 ++ ++ // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) ++ ext v1.16b, v1.16b, v1.16b, #12 ++ // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) ++ ext v2.16b, v2.16b, v2.16b, #8 ++ // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) ++ ext v3.16b, v3.16b, v3.16b, #4 ++ ++ subs x3, x3, #1 ++ b.ne .Ldoubleround ++ ++ ld1 {v4.16b-v7.16b}, [x2] ++ ++ // o0 = i0 ^ (x0 + s0) ++ add v0.4s, v0.4s, v8.4s ++ eor v0.16b, v0.16b, v4.16b ++ ++ // o1 = i1 ^ (x1 + s1) ++ add v1.4s, v1.4s, v9.4s ++ eor v1.16b, v1.16b, v5.16b ++ ++ // o2 = i2 ^ (x2 + s2) ++ add v2.4s, v2.4s, v10.4s ++ eor v2.16b, v2.16b, v6.16b ++ ++ // o3 = i3 ^ (x3 + s3) ++ add v3.4s, v3.4s, v11.4s ++ eor v3.16b, v3.16b, v7.16b ++ ++ st1 {v0.16b-v3.16b}, [x1] ++ ++ ret ++ENDPROC(chacha20_asm_block_xor_neon) ++ ++ .align 6 ++ENTRY(chacha20_asm_4block_xor_neon) ++ // x0: Input state matrix, s ++ // x1: 4 data blocks output, o ++ // x2: 4 data blocks input, i ++ ++ // ++ // This function encrypts four consecutive ChaCha20 blocks by loading ++ // the state matrix in NEON registers four times. The algorithm performs ++ // each operation on the corresponding word of each state matrix, hence ++ // requires no word shuffling. For final XORing step we transpose the ++ // matrix by interleaving 32- and then 64-bit words, which allows us to ++ // do XOR in NEON registers. ++ // ++ adr x3, CTRINC // ... 
and ROT8 ++ ld1 {v30.4s-v31.4s}, [x3] ++ ++ // x0..15[0-3] = s0..3[0..3] ++ mov x4, x0 ++ ld4r { v0.4s- v3.4s}, [x4], #16 ++ ld4r { v4.4s- v7.4s}, [x4], #16 ++ ld4r { v8.4s-v11.4s}, [x4], #16 ++ ld4r {v12.4s-v15.4s}, [x4] ++ ++ // x12 += counter values 0-3 ++ add v12.4s, v12.4s, v30.4s ++ ++ mov x3, #10 ++ ++.Ldoubleround4: ++ // x0 += x4, x12 = rotl32(x12 ^ x0, 16) ++ // x1 += x5, x13 = rotl32(x13 ^ x1, 16) ++ // x2 += x6, x14 = rotl32(x14 ^ x2, 16) ++ // x3 += x7, x15 = rotl32(x15 ^ x3, 16) ++ add v0.4s, v0.4s, v4.4s ++ add v1.4s, v1.4s, v5.4s ++ add v2.4s, v2.4s, v6.4s ++ add v3.4s, v3.4s, v7.4s ++ ++ eor v12.16b, v12.16b, v0.16b ++ eor v13.16b, v13.16b, v1.16b ++ eor v14.16b, v14.16b, v2.16b ++ eor v15.16b, v15.16b, v3.16b ++ ++ rev32 v12.8h, v12.8h ++ rev32 v13.8h, v13.8h ++ rev32 v14.8h, v14.8h ++ rev32 v15.8h, v15.8h ++ ++ // x8 += x12, x4 = rotl32(x4 ^ x8, 12) ++ // x9 += x13, x5 = rotl32(x5 ^ x9, 12) ++ // x10 += x14, x6 = rotl32(x6 ^ x10, 12) ++ // x11 += x15, x7 = rotl32(x7 ^ x11, 12) ++ add v8.4s, v8.4s, v12.4s ++ add v9.4s, v9.4s, v13.4s ++ add v10.4s, v10.4s, v14.4s ++ add v11.4s, v11.4s, v15.4s ++ ++ eor v16.16b, v4.16b, v8.16b ++ eor v17.16b, v5.16b, v9.16b ++ eor v18.16b, v6.16b, v10.16b ++ eor v19.16b, v7.16b, v11.16b ++ ++ shl v4.4s, v16.4s, #12 ++ shl v5.4s, v17.4s, #12 ++ shl v6.4s, v18.4s, #12 ++ shl v7.4s, v19.4s, #12 ++ ++ sri v4.4s, v16.4s, #20 ++ sri v5.4s, v17.4s, #20 ++ sri v6.4s, v18.4s, #20 ++ sri v7.4s, v19.4s, #20 ++ ++ // x0 += x4, x12 = rotl32(x12 ^ x0, 8) ++ // x1 += x5, x13 = rotl32(x13 ^ x1, 8) ++ // x2 += x6, x14 = rotl32(x14 ^ x2, 8) ++ // x3 += x7, x15 = rotl32(x15 ^ x3, 8) ++ add v0.4s, v0.4s, v4.4s ++ add v1.4s, v1.4s, v5.4s ++ add v2.4s, v2.4s, v6.4s ++ add v3.4s, v3.4s, v7.4s ++ ++ eor v12.16b, v12.16b, v0.16b ++ eor v13.16b, v13.16b, v1.16b ++ eor v14.16b, v14.16b, v2.16b ++ eor v15.16b, v15.16b, v3.16b ++ ++ tbl v12.16b, {v12.16b}, v31.16b ++ tbl v13.16b, {v13.16b}, v31.16b ++ tbl v14.16b, {v14.16b}, v31.16b ++ tbl v15.16b, {v15.16b}, v31.16b ++ ++ // x8 += x12, x4 = rotl32(x4 ^ x8, 7) ++ // x9 += x13, x5 = rotl32(x5 ^ x9, 7) ++ // x10 += x14, x6 = rotl32(x6 ^ x10, 7) ++ // x11 += x15, x7 = rotl32(x7 ^ x11, 7) ++ add v8.4s, v8.4s, v12.4s ++ add v9.4s, v9.4s, v13.4s ++ add v10.4s, v10.4s, v14.4s ++ add v11.4s, v11.4s, v15.4s ++ ++ eor v16.16b, v4.16b, v8.16b ++ eor v17.16b, v5.16b, v9.16b ++ eor v18.16b, v6.16b, v10.16b ++ eor v19.16b, v7.16b, v11.16b ++ ++ shl v4.4s, v16.4s, #7 ++ shl v5.4s, v17.4s, #7 ++ shl v6.4s, v18.4s, #7 ++ shl v7.4s, v19.4s, #7 ++ ++ sri v4.4s, v16.4s, #25 ++ sri v5.4s, v17.4s, #25 ++ sri v6.4s, v18.4s, #25 ++ sri v7.4s, v19.4s, #25 ++ ++ // x0 += x5, x15 = rotl32(x15 ^ x0, 16) ++ // x1 += x6, x12 = rotl32(x12 ^ x1, 16) ++ // x2 += x7, x13 = rotl32(x13 ^ x2, 16) ++ // x3 += x4, x14 = rotl32(x14 ^ x3, 16) ++ add v0.4s, v0.4s, v5.4s ++ add v1.4s, v1.4s, v6.4s ++ add v2.4s, v2.4s, v7.4s ++ add v3.4s, v3.4s, v4.4s ++ ++ eor v15.16b, v15.16b, v0.16b ++ eor v12.16b, v12.16b, v1.16b ++ eor v13.16b, v13.16b, v2.16b ++ eor v14.16b, v14.16b, v3.16b ++ ++ rev32 v15.8h, v15.8h ++ rev32 v12.8h, v12.8h ++ rev32 v13.8h, v13.8h ++ rev32 v14.8h, v14.8h ++ ++ // x10 += x15, x5 = rotl32(x5 ^ x10, 12) ++ // x11 += x12, x6 = rotl32(x6 ^ x11, 12) ++ // x8 += x13, x7 = rotl32(x7 ^ x8, 12) ++ // x9 += x14, x4 = rotl32(x4 ^ x9, 12) ++ add v10.4s, v10.4s, v15.4s ++ add v11.4s, v11.4s, v12.4s ++ add v8.4s, v8.4s, v13.4s ++ add v9.4s, v9.4s, v14.4s ++ ++ eor v16.16b, v5.16b, v10.16b ++ eor v17.16b, v6.16b, v11.16b ++ eor v18.16b, v7.16b, v8.16b ++ 
eor v19.16b, v4.16b, v9.16b ++ ++ shl v5.4s, v16.4s, #12 ++ shl v6.4s, v17.4s, #12 ++ shl v7.4s, v18.4s, #12 ++ shl v4.4s, v19.4s, #12 ++ ++ sri v5.4s, v16.4s, #20 ++ sri v6.4s, v17.4s, #20 ++ sri v7.4s, v18.4s, #20 ++ sri v4.4s, v19.4s, #20 ++ ++ // x0 += x5, x15 = rotl32(x15 ^ x0, 8) ++ // x1 += x6, x12 = rotl32(x12 ^ x1, 8) ++ // x2 += x7, x13 = rotl32(x13 ^ x2, 8) ++ // x3 += x4, x14 = rotl32(x14 ^ x3, 8) ++ add v0.4s, v0.4s, v5.4s ++ add v1.4s, v1.4s, v6.4s ++ add v2.4s, v2.4s, v7.4s ++ add v3.4s, v3.4s, v4.4s ++ ++ eor v15.16b, v15.16b, v0.16b ++ eor v12.16b, v12.16b, v1.16b ++ eor v13.16b, v13.16b, v2.16b ++ eor v14.16b, v14.16b, v3.16b ++ ++ tbl v15.16b, {v15.16b}, v31.16b ++ tbl v12.16b, {v12.16b}, v31.16b ++ tbl v13.16b, {v13.16b}, v31.16b ++ tbl v14.16b, {v14.16b}, v31.16b ++ ++ // x10 += x15, x5 = rotl32(x5 ^ x10, 7) ++ // x11 += x12, x6 = rotl32(x6 ^ x11, 7) ++ // x8 += x13, x7 = rotl32(x7 ^ x8, 7) ++ // x9 += x14, x4 = rotl32(x4 ^ x9, 7) ++ add v10.4s, v10.4s, v15.4s ++ add v11.4s, v11.4s, v12.4s ++ add v8.4s, v8.4s, v13.4s ++ add v9.4s, v9.4s, v14.4s ++ ++ eor v16.16b, v5.16b, v10.16b ++ eor v17.16b, v6.16b, v11.16b ++ eor v18.16b, v7.16b, v8.16b ++ eor v19.16b, v4.16b, v9.16b ++ ++ shl v5.4s, v16.4s, #7 ++ shl v6.4s, v17.4s, #7 ++ shl v7.4s, v18.4s, #7 ++ shl v4.4s, v19.4s, #7 ++ ++ sri v5.4s, v16.4s, #25 ++ sri v6.4s, v17.4s, #25 ++ sri v7.4s, v18.4s, #25 ++ sri v4.4s, v19.4s, #25 ++ ++ subs x3, x3, #1 ++ b.ne .Ldoubleround4 ++ ++ ld4r {v16.4s-v19.4s}, [x0], #16 ++ ld4r {v20.4s-v23.4s}, [x0], #16 ++ ++ // x12 += counter values 0-3 ++ add v12.4s, v12.4s, v30.4s ++ ++ // x0[0-3] += s0[0] ++ // x1[0-3] += s0[1] ++ // x2[0-3] += s0[2] ++ // x3[0-3] += s0[3] ++ add v0.4s, v0.4s, v16.4s ++ add v1.4s, v1.4s, v17.4s ++ add v2.4s, v2.4s, v18.4s ++ add v3.4s, v3.4s, v19.4s ++ ++ ld4r {v24.4s-v27.4s}, [x0], #16 ++ ld4r {v28.4s-v31.4s}, [x0] ++ ++ // x4[0-3] += s1[0] ++ // x5[0-3] += s1[1] ++ // x6[0-3] += s1[2] ++ // x7[0-3] += s1[3] ++ add v4.4s, v4.4s, v20.4s ++ add v5.4s, v5.4s, v21.4s ++ add v6.4s, v6.4s, v22.4s ++ add v7.4s, v7.4s, v23.4s ++ ++ // x8[0-3] += s2[0] ++ // x9[0-3] += s2[1] ++ // x10[0-3] += s2[2] ++ // x11[0-3] += s2[3] ++ add v8.4s, v8.4s, v24.4s ++ add v9.4s, v9.4s, v25.4s ++ add v10.4s, v10.4s, v26.4s ++ add v11.4s, v11.4s, v27.4s ++ ++ // x12[0-3] += s3[0] ++ // x13[0-3] += s3[1] ++ // x14[0-3] += s3[2] ++ // x15[0-3] += s3[3] ++ add v12.4s, v12.4s, v28.4s ++ add v13.4s, v13.4s, v29.4s ++ add v14.4s, v14.4s, v30.4s ++ add v15.4s, v15.4s, v31.4s ++ ++ // interleave 32-bit words in state n, n+1 ++ zip1 v16.4s, v0.4s, v1.4s ++ zip2 v17.4s, v0.4s, v1.4s ++ zip1 v18.4s, v2.4s, v3.4s ++ zip2 v19.4s, v2.4s, v3.4s ++ zip1 v20.4s, v4.4s, v5.4s ++ zip2 v21.4s, v4.4s, v5.4s ++ zip1 v22.4s, v6.4s, v7.4s ++ zip2 v23.4s, v6.4s, v7.4s ++ zip1 v24.4s, v8.4s, v9.4s ++ zip2 v25.4s, v8.4s, v9.4s ++ zip1 v26.4s, v10.4s, v11.4s ++ zip2 v27.4s, v10.4s, v11.4s ++ zip1 v28.4s, v12.4s, v13.4s ++ zip2 v29.4s, v12.4s, v13.4s ++ zip1 v30.4s, v14.4s, v15.4s ++ zip2 v31.4s, v14.4s, v15.4s ++ ++ // interleave 64-bit words in state n, n+2 ++ zip1 v0.2d, v16.2d, v18.2d ++ zip2 v4.2d, v16.2d, v18.2d ++ zip1 v8.2d, v17.2d, v19.2d ++ zip2 v12.2d, v17.2d, v19.2d ++ ld1 {v16.16b-v19.16b}, [x2], #64 ++ ++ zip1 v1.2d, v20.2d, v22.2d ++ zip2 v5.2d, v20.2d, v22.2d ++ zip1 v9.2d, v21.2d, v23.2d ++ zip2 v13.2d, v21.2d, v23.2d ++ ld1 {v20.16b-v23.16b}, [x2], #64 ++ ++ zip1 v2.2d, v24.2d, v26.2d ++ zip2 v6.2d, v24.2d, v26.2d ++ zip1 v10.2d, v25.2d, v27.2d ++ zip2 v14.2d, v25.2d, v27.2d ++ ld1 
{v24.16b-v27.16b}, [x2], #64 ++ ++ zip1 v3.2d, v28.2d, v30.2d ++ zip2 v7.2d, v28.2d, v30.2d ++ zip1 v11.2d, v29.2d, v31.2d ++ zip2 v15.2d, v29.2d, v31.2d ++ ld1 {v28.16b-v31.16b}, [x2] ++ ++ // xor with corresponding input, write to output ++ eor v16.16b, v16.16b, v0.16b ++ eor v17.16b, v17.16b, v1.16b ++ eor v18.16b, v18.16b, v2.16b ++ eor v19.16b, v19.16b, v3.16b ++ eor v20.16b, v20.16b, v4.16b ++ eor v21.16b, v21.16b, v5.16b ++ st1 {v16.16b-v19.16b}, [x1], #64 ++ eor v22.16b, v22.16b, v6.16b ++ eor v23.16b, v23.16b, v7.16b ++ eor v24.16b, v24.16b, v8.16b ++ eor v25.16b, v25.16b, v9.16b ++ st1 {v20.16b-v23.16b}, [x1], #64 ++ eor v26.16b, v26.16b, v10.16b ++ eor v27.16b, v27.16b, v11.16b ++ eor v28.16b, v28.16b, v12.16b ++ st1 {v24.16b-v27.16b}, [x1], #64 ++ eor v29.16b, v29.16b, v13.16b ++ eor v30.16b, v30.16b, v14.16b ++ eor v31.16b, v31.16b, v15.16b ++ st1 {v28.16b-v31.16b}, [x1] ++ ++ ret ++ENDPROC(chacha20_asm_4block_xor_neon) ++ ++CTRINC: .word 0, 1, 2, 3 ++ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/crypto/chacha20-neon-arm.S 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,523 @@ ++/* ++ * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions ++ * ++ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org> ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * Based on: ++ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions ++ * ++ * Copyright (C) 2015 Martin Willi ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ */ ++ ++#include <linux/linkage.h> ++ ++ .text ++ .fpu neon ++ .align 5 ++ ++ENTRY(chacha20_asm_block_xor_neon) ++ // r0: Input state matrix, s ++ // r1: 1 data block output, o ++ // r2: 1 data block input, i ++ ++ // ++ // This function encrypts one ChaCha20 block by loading the state matrix ++ // in four NEON registers. It performs matrix operations on four words in ++ // parallel, but requires shuffling to rearrange the words after each ++ // round.
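For reference while reading the NEON sequences below, here is a minimal portable C sketch of the ChaCha20 quarter round and double round that this one-block routine vectorizes (per RFC 7539); the helper names are illustrative and not part of the patch or the kernel API.

```c
#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

/* One quarter round; the one-block NEON code performs four of these
 * at a time, one per 32-bit lane of a q-register. */
static void chacha20_quarter_round(uint32_t x[16], int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
}

/* A double round: four column rounds, then four diagonal rounds.
 * The vext.8 shuffles in the assembly rotate rows 1-3 between the
 * two halves so the diagonal rounds can reuse the column-round
 * instruction sequence. */
static void chacha20_double_round(uint32_t x[16])
{
	chacha20_quarter_round(x, 0, 4,  8, 12);
	chacha20_quarter_round(x, 1, 5,  9, 13);
	chacha20_quarter_round(x, 2, 6, 10, 14);
	chacha20_quarter_round(x, 3, 7, 11, 15);
	chacha20_quarter_round(x, 0, 5, 10, 15);
	chacha20_quarter_round(x, 1, 6, 11, 12);
	chacha20_quarter_round(x, 2, 7,  8, 13);
	chacha20_quarter_round(x, 3, 4,  9, 14);
}
```

ChaCha20 runs ten of these double rounds (the `mov r3, #10` loop counter below) and then adds the original state back into the working state before XORing the result with the input block.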
++ // ++ ++ // x0..3 = s0..3 ++ add ip, r0, #0x20 ++ vld1.32 {q0-q1}, [r0] ++ vld1.32 {q2-q3}, [ip] ++ ++ vmov q8, q0 ++ vmov q9, q1 ++ vmov q10, q2 ++ vmov q11, q3 ++ ++ mov r3, #10 ++ ++.Ldoubleround: ++ // x0 += x1, x3 = rotl32(x3 ^ x0, 16) ++ vadd.i32 q0, q0, q1 ++ veor q4, q3, q0 ++ vshl.u32 q3, q4, #16 ++ vsri.u32 q3, q4, #16 ++ ++ // x2 += x3, x1 = rotl32(x1 ^ x2, 12) ++ vadd.i32 q2, q2, q3 ++ veor q4, q1, q2 ++ vshl.u32 q1, q4, #12 ++ vsri.u32 q1, q4, #20 ++ ++ // x0 += x1, x3 = rotl32(x3 ^ x0, 8) ++ vadd.i32 q0, q0, q1 ++ veor q4, q3, q0 ++ vshl.u32 q3, q4, #8 ++ vsri.u32 q3, q4, #24 ++ ++ // x2 += x3, x1 = rotl32(x1 ^ x2, 7) ++ vadd.i32 q2, q2, q3 ++ veor q4, q1, q2 ++ vshl.u32 q1, q4, #7 ++ vsri.u32 q1, q4, #25 ++ ++ // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) ++ vext.8 q1, q1, q1, #4 ++ // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) ++ vext.8 q2, q2, q2, #8 ++ // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) ++ vext.8 q3, q3, q3, #12 ++ ++ // x0 += x1, x3 = rotl32(x3 ^ x0, 16) ++ vadd.i32 q0, q0, q1 ++ veor q4, q3, q0 ++ vshl.u32 q3, q4, #16 ++ vsri.u32 q3, q4, #16 ++ ++ // x2 += x3, x1 = rotl32(x1 ^ x2, 12) ++ vadd.i32 q2, q2, q3 ++ veor q4, q1, q2 ++ vshl.u32 q1, q4, #12 ++ vsri.u32 q1, q4, #20 ++ ++ // x0 += x1, x3 = rotl32(x3 ^ x0, 8) ++ vadd.i32 q0, q0, q1 ++ veor q4, q3, q0 ++ vshl.u32 q3, q4, #8 ++ vsri.u32 q3, q4, #24 ++ ++ // x2 += x3, x1 = rotl32(x1 ^ x2, 7) ++ vadd.i32 q2, q2, q3 ++ veor q4, q1, q2 ++ vshl.u32 q1, q4, #7 ++ vsri.u32 q1, q4, #25 ++ ++ // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) ++ vext.8 q1, q1, q1, #12 ++ // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) ++ vext.8 q2, q2, q2, #8 ++ // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) ++ vext.8 q3, q3, q3, #4 ++ ++ subs r3, r3, #1 ++ bne .Ldoubleround ++ ++ add ip, r2, #0x20 ++ vld1.8 {q4-q5}, [r2] ++ vld1.8 {q6-q7}, [ip] ++ ++ // o0 = i0 ^ (x0 + s0) ++ vadd.i32 q0, q0, q8 ++ veor q0, q0, q4 ++ ++ // o1 = i1 ^ (x1 + s1) ++ vadd.i32 q1, q1, q9 ++ veor q1, q1, q5 ++ ++ // o2 = i2 ^ (x2 + s2) ++ vadd.i32 q2, q2, q10 ++ veor q2, q2, q6 ++ ++ // o3 = i3 ^ (x3 + s3) ++ vadd.i32 q3, q3, q11 ++ veor q3, q3, q7 ++ ++ add ip, r1, #0x20 ++ vst1.8 {q0-q1}, [r1] ++ vst1.8 {q2-q3}, [ip] ++ ++ bx lr ++ENDPROC(chacha20_asm_block_xor_neon) ++ ++ .align 5 ++ENTRY(chacha20_asm_4block_xor_neon) ++ push {r4-r6, lr} ++ mov ip, sp // preserve the stack pointer ++ sub r3, sp, #0x20 // allocate a 32 byte buffer ++ bic r3, r3, #0x1f // aligned to 32 bytes ++ mov sp, r3 ++ ++ // r0: Input state matrix, s ++ // r1: 4 data blocks output, o ++ // r2: 4 data blocks input, i ++ ++ // ++ // This function encrypts four consecutive ChaCha20 blocks by loading ++ // the state matrix in NEON registers four times. The algorithm performs ++ // each operation on the corresponding word of each state matrix, hence ++ // requires no word shuffling. For final XORing step we transpose the ++ // matrix by interleaving 32- and then 64-bit words, which allows us to ++ // do XOR in NEON registers. 
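The four-block layout described above, as a hedged scalar sketch (all names invented for illustration): keep word i of all four blocks side by side, so every quarter-round step applies to four blocks at once and no per-round shuffling is needed; only a final transpose converts the word-major lanes back into four contiguous 64-byte output blocks.

```c
#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

/* Indices of the eight quarter rounds in one double round. */
static const uint8_t qr_idx[8][4] = {
	{ 0, 4,  8, 12 }, { 1, 5,  9, 13 }, { 2, 6, 10, 14 }, { 3, 7, 11, 15 },
	{ 0, 5, 10, 15 }, { 1, 6, 11, 12 }, { 2, 7,  8, 13 }, { 3, 4,  9, 14 },
};

/* lane[i][b] is state word i of block b; each vector register in the
 * assembly holds lane[i][0..3]. Illustrative sketch, not the patch. */
static void chacha20_4block_sketch(uint32_t out[4][16], const uint32_t s[16])
{
	uint32_t lane[16][4];
	int i, b, r, q;

	for (i = 0; i < 16; ++i)
		for (b = 0; b < 4; ++b)
			lane[i][b] = s[i] + (i == 12 ? b : 0);	/* counters 0-3 */

	for (r = 0; r < 10; ++r) {
		for (q = 0; q < 8; ++q) {
			int a = qr_idx[q][0], bb = qr_idx[q][1];
			int c = qr_idx[q][2], d = qr_idx[q][3];

			/* Identical work on all four lanes: one vector
			 * instruction per statement in the assembly. */
			for (b = 0; b < 4; ++b) {
				lane[a][b] += lane[bb][b];
				lane[d][b] = rotl32(lane[d][b] ^ lane[a][b], 16);
				lane[c][b] += lane[d][b];
				lane[bb][b] = rotl32(lane[bb][b] ^ lane[c][b], 12);
				lane[a][b] += lane[bb][b];
				lane[d][b] = rotl32(lane[d][b] ^ lane[a][b], 8);
				lane[c][b] += lane[d][b];
				lane[bb][b] = rotl32(lane[bb][b] ^ lane[c][b], 7);
			}
		}
	}

	for (i = 0; i < 16; ++i)
		for (b = 0; b < 4; ++b) {
			/* add back the input state (and per-block counter) */
			lane[i][b] += s[i] + (i == 12 ? b : 0);
			/* transpose word-major lanes to block-major output */
			out[b][i] = lane[i][b];
		}
}
```

On NEON the innermost loop over `b` disappears: each statement of the quarter round becomes one vector instruction on a register holding `lane[i][0..3]`, and the closing `out[b][i] = lane[i][b]` transpose is what the 32-bit and then 64-bit interleaves (`vzip.32`/`vswp` here, `zip1`/`zip2` in the arm64 version) implement.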
++ // ++ ++ // x0..15[0-3] = s0..3[0..3] ++ add r3, r0, #0x20 ++ vld1.32 {q0-q1}, [r0] ++ vld1.32 {q2-q3}, [r3] ++ ++ adr r3, CTRINC ++ vdup.32 q15, d7[1] ++ vdup.32 q14, d7[0] ++ vld1.32 {q11}, [r3, :128] ++ vdup.32 q13, d6[1] ++ vdup.32 q12, d6[0] ++ vadd.i32 q12, q12, q11 // x12 += counter values 0-3 ++ vdup.32 q11, d5[1] ++ vdup.32 q10, d5[0] ++ vdup.32 q9, d4[1] ++ vdup.32 q8, d4[0] ++ vdup.32 q7, d3[1] ++ vdup.32 q6, d3[0] ++ vdup.32 q5, d2[1] ++ vdup.32 q4, d2[0] ++ vdup.32 q3, d1[1] ++ vdup.32 q2, d1[0] ++ vdup.32 q1, d0[1] ++ vdup.32 q0, d0[0] ++ ++ mov r3, #10 ++ ++.Ldoubleround4: ++ // x0 += x4, x12 = rotl32(x12 ^ x0, 16) ++ // x1 += x5, x13 = rotl32(x13 ^ x1, 16) ++ // x2 += x6, x14 = rotl32(x14 ^ x2, 16) ++ // x3 += x7, x15 = rotl32(x15 ^ x3, 16) ++ vadd.i32 q0, q0, q4 ++ vadd.i32 q1, q1, q5 ++ vadd.i32 q2, q2, q6 ++ vadd.i32 q3, q3, q7 ++ ++ veor q12, q12, q0 ++ veor q13, q13, q1 ++ veor q14, q14, q2 ++ veor q15, q15, q3 ++ ++ vrev32.16 q12, q12 ++ vrev32.16 q13, q13 ++ vrev32.16 q14, q14 ++ vrev32.16 q15, q15 ++ ++ // x8 += x12, x4 = rotl32(x4 ^ x8, 12) ++ // x9 += x13, x5 = rotl32(x5 ^ x9, 12) ++ // x10 += x14, x6 = rotl32(x6 ^ x10, 12) ++ // x11 += x15, x7 = rotl32(x7 ^ x11, 12) ++ vadd.i32 q8, q8, q12 ++ vadd.i32 q9, q9, q13 ++ vadd.i32 q10, q10, q14 ++ vadd.i32 q11, q11, q15 ++ ++ vst1.32 {q8-q9}, [sp, :256] ++ ++ veor q8, q4, q8 ++ veor q9, q5, q9 ++ vshl.u32 q4, q8, #12 ++ vshl.u32 q5, q9, #12 ++ vsri.u32 q4, q8, #20 ++ vsri.u32 q5, q9, #20 ++ ++ veor q8, q6, q10 ++ veor q9, q7, q11 ++ vshl.u32 q6, q8, #12 ++ vshl.u32 q7, q9, #12 ++ vsri.u32 q6, q8, #20 ++ vsri.u32 q7, q9, #20 ++ ++ // x0 += x4, x12 = rotl32(x12 ^ x0, 8) ++ // x1 += x5, x13 = rotl32(x13 ^ x1, 8) ++ // x2 += x6, x14 = rotl32(x14 ^ x2, 8) ++ // x3 += x7, x15 = rotl32(x15 ^ x3, 8) ++ vadd.i32 q0, q0, q4 ++ vadd.i32 q1, q1, q5 ++ vadd.i32 q2, q2, q6 ++ vadd.i32 q3, q3, q7 ++ ++ veor q8, q12, q0 ++ veor q9, q13, q1 ++ vshl.u32 q12, q8, #8 ++ vshl.u32 q13, q9, #8 ++ vsri.u32 q12, q8, #24 ++ vsri.u32 q13, q9, #24 ++ ++ veor q8, q14, q2 ++ veor q9, q15, q3 ++ vshl.u32 q14, q8, #8 ++ vshl.u32 q15, q9, #8 ++ vsri.u32 q14, q8, #24 ++ vsri.u32 q15, q9, #24 ++ ++ vld1.32 {q8-q9}, [sp, :256] ++ ++ // x8 += x12, x4 = rotl32(x4 ^ x8, 7) ++ // x9 += x13, x5 = rotl32(x5 ^ x9, 7) ++ // x10 += x14, x6 = rotl32(x6 ^ x10, 7) ++ // x11 += x15, x7 = rotl32(x7 ^ x11, 7) ++ vadd.i32 q8, q8, q12 ++ vadd.i32 q9, q9, q13 ++ vadd.i32 q10, q10, q14 ++ vadd.i32 q11, q11, q15 ++ ++ vst1.32 {q8-q9}, [sp, :256] ++ ++ veor q8, q4, q8 ++ veor q9, q5, q9 ++ vshl.u32 q4, q8, #7 ++ vshl.u32 q5, q9, #7 ++ vsri.u32 q4, q8, #25 ++ vsri.u32 q5, q9, #25 ++ ++ veor q8, q6, q10 ++ veor q9, q7, q11 ++ vshl.u32 q6, q8, #7 ++ vshl.u32 q7, q9, #7 ++ vsri.u32 q6, q8, #25 ++ vsri.u32 q7, q9, #25 ++ ++ vld1.32 {q8-q9}, [sp, :256] ++ ++ // x0 += x5, x15 = rotl32(x15 ^ x0, 16) ++ // x1 += x6, x12 = rotl32(x12 ^ x1, 16) ++ // x2 += x7, x13 = rotl32(x13 ^ x2, 16) ++ // x3 += x4, x14 = rotl32(x14 ^ x3, 16) ++ vadd.i32 q0, q0, q5 ++ vadd.i32 q1, q1, q6 ++ vadd.i32 q2, q2, q7 ++ vadd.i32 q3, q3, q4 ++ ++ veor q15, q15, q0 ++ veor q12, q12, q1 ++ veor q13, q13, q2 ++ veor q14, q14, q3 ++ ++ vrev32.16 q15, q15 ++ vrev32.16 q12, q12 ++ vrev32.16 q13, q13 ++ vrev32.16 q14, q14 ++ ++ // x10 += x15, x5 = rotl32(x5 ^ x10, 12) ++ // x11 += x12, x6 = rotl32(x6 ^ x11, 12) ++ // x8 += x13, x7 = rotl32(x7 ^ x8, 12) ++ // x9 += x14, x4 = rotl32(x4 ^ x9, 12) ++ vadd.i32 q10, q10, q15 ++ vadd.i32 q11, q11, q12 ++ vadd.i32 q8, q8, q13 ++ vadd.i32 q9, q9, q14 ++ ++ vst1.32 
{q8-q9}, [sp, :256] ++ ++ veor q8, q7, q8 ++ veor q9, q4, q9 ++ vshl.u32 q7, q8, #12 ++ vshl.u32 q4, q9, #12 ++ vsri.u32 q7, q8, #20 ++ vsri.u32 q4, q9, #20 ++ ++ veor q8, q5, q10 ++ veor q9, q6, q11 ++ vshl.u32 q5, q8, #12 ++ vshl.u32 q6, q9, #12 ++ vsri.u32 q5, q8, #20 ++ vsri.u32 q6, q9, #20 ++ ++ // x0 += x5, x15 = rotl32(x15 ^ x0, 8) ++ // x1 += x6, x12 = rotl32(x12 ^ x1, 8) ++ // x2 += x7, x13 = rotl32(x13 ^ x2, 8) ++ // x3 += x4, x14 = rotl32(x14 ^ x3, 8) ++ vadd.i32 q0, q0, q5 ++ vadd.i32 q1, q1, q6 ++ vadd.i32 q2, q2, q7 ++ vadd.i32 q3, q3, q4 ++ ++ veor q8, q15, q0 ++ veor q9, q12, q1 ++ vshl.u32 q15, q8, #8 ++ vshl.u32 q12, q9, #8 ++ vsri.u32 q15, q8, #24 ++ vsri.u32 q12, q9, #24 ++ ++ veor q8, q13, q2 ++ veor q9, q14, q3 ++ vshl.u32 q13, q8, #8 ++ vshl.u32 q14, q9, #8 ++ vsri.u32 q13, q8, #24 ++ vsri.u32 q14, q9, #24 ++ ++ vld1.32 {q8-q9}, [sp, :256] ++ ++ // x10 += x15, x5 = rotl32(x5 ^ x10, 7) ++ // x11 += x12, x6 = rotl32(x6 ^ x11, 7) ++ // x8 += x13, x7 = rotl32(x7 ^ x8, 7) ++ // x9 += x14, x4 = rotl32(x4 ^ x9, 7) ++ vadd.i32 q10, q10, q15 ++ vadd.i32 q11, q11, q12 ++ vadd.i32 q8, q8, q13 ++ vadd.i32 q9, q9, q14 ++ ++ vst1.32 {q8-q9}, [sp, :256] ++ ++ veor q8, q7, q8 ++ veor q9, q4, q9 ++ vshl.u32 q7, q8, #7 ++ vshl.u32 q4, q9, #7 ++ vsri.u32 q7, q8, #25 ++ vsri.u32 q4, q9, #25 ++ ++ veor q8, q5, q10 ++ veor q9, q6, q11 ++ vshl.u32 q5, q8, #7 ++ vshl.u32 q6, q9, #7 ++ vsri.u32 q5, q8, #25 ++ vsri.u32 q6, q9, #25 ++ ++ subs r3, r3, #1 ++ beq 0f ++ ++ vld1.32 {q8-q9}, [sp, :256] ++ b .Ldoubleround4 ++ ++ // x0[0-3] += s0[0] ++ // x1[0-3] += s0[1] ++ // x2[0-3] += s0[2] ++ // x3[0-3] += s0[3] ++0: ldmia r0!, {r3-r6} ++ vdup.32 q8, r3 ++ vdup.32 q9, r4 ++ vadd.i32 q0, q0, q8 ++ vadd.i32 q1, q1, q9 ++ vdup.32 q8, r5 ++ vdup.32 q9, r6 ++ vadd.i32 q2, q2, q8 ++ vadd.i32 q3, q3, q9 ++ ++ // x4[0-3] += s1[0] ++ // x5[0-3] += s1[1] ++ // x6[0-3] += s1[2] ++ // x7[0-3] += s1[3] ++ ldmia r0!, {r3-r6} ++ vdup.32 q8, r3 ++ vdup.32 q9, r4 ++ vadd.i32 q4, q4, q8 ++ vadd.i32 q5, q5, q9 ++ vdup.32 q8, r5 ++ vdup.32 q9, r6 ++ vadd.i32 q6, q6, q8 ++ vadd.i32 q7, q7, q9 ++ ++ // interleave 32-bit words in state n, n+1 ++ vzip.32 q0, q1 ++ vzip.32 q2, q3 ++ vzip.32 q4, q5 ++ vzip.32 q6, q7 ++ ++ // interleave 64-bit words in state n, n+2 ++ vswp d1, d4 ++ vswp d3, d6 ++ vswp d9, d12 ++ vswp d11, d14 ++ ++ // xor with corresponding input, write to output ++ vld1.8 {q8-q9}, [r2]! ++ veor q8, q8, q0 ++ veor q9, q9, q4 ++ vst1.8 {q8-q9}, [r1]! ++ ++ vld1.32 {q8-q9}, [sp, :256] ++ ++ // x8[0-3] += s2[0] ++ // x9[0-3] += s2[1] ++ // x10[0-3] += s2[2] ++ // x11[0-3] += s2[3] ++ ldmia r0!, {r3-r6} ++ vdup.32 q0, r3 ++ vdup.32 q4, r4 ++ vadd.i32 q8, q8, q0 ++ vadd.i32 q9, q9, q4 ++ vdup.32 q0, r5 ++ vdup.32 q4, r6 ++ vadd.i32 q10, q10, q0 ++ vadd.i32 q11, q11, q4 ++ ++ // x12[0-3] += s3[0] ++ // x13[0-3] += s3[1] ++ // x14[0-3] += s3[2] ++ // x15[0-3] += s3[3] ++ ldmia r0!, {r3-r6} ++ vdup.32 q0, r3 ++ vdup.32 q4, r4 ++ adr r3, CTRINC ++ vadd.i32 q12, q12, q0 ++ vld1.32 {q0}, [r3, :128] ++ vadd.i32 q13, q13, q4 ++ vadd.i32 q12, q12, q0 // x12 += counter values 0-3 ++ ++ vdup.32 q0, r5 ++ vdup.32 q4, r6 ++ vadd.i32 q14, q14, q0 ++ vadd.i32 q15, q15, q4 ++ ++ // interleave 32-bit words in state n, n+1 ++ vzip.32 q8, q9 ++ vzip.32 q10, q11 ++ vzip.32 q12, q13 ++ vzip.32 q14, q15 ++ ++ // interleave 64-bit words in state n, n+2 ++ vswp d17, d20 ++ vswp d19, d22 ++ vswp d25, d28 ++ vswp d27, d30 ++ ++ vmov q4, q1 ++ ++ vld1.8 {q0-q1}, [r2]! ++ veor q0, q0, q8 ++ veor q1, q1, q12 ++ vst1.8 {q0-q1}, [r1]! 
++ ++ vld1.8 {q0-q1}, [r2]! ++ veor q0, q0, q2 ++ veor q1, q1, q6 ++ vst1.8 {q0-q1}, [r1]! ++ ++ vld1.8 {q0-q1}, [r2]! ++ veor q0, q0, q10 ++ veor q1, q1, q14 ++ vst1.8 {q0-q1}, [r1]! ++ ++ vld1.8 {q0-q1}, [r2]! ++ veor q0, q0, q4 ++ veor q1, q1, q5 ++ vst1.8 {q0-q1}, [r1]! ++ ++ vld1.8 {q0-q1}, [r2]! ++ veor q0, q0, q9 ++ veor q1, q1, q13 ++ vst1.8 {q0-q1}, [r1]! ++ ++ vld1.8 {q0-q1}, [r2]! ++ veor q0, q0, q3 ++ veor q1, q1, q7 ++ vst1.8 {q0-q1}, [r1]! ++ ++ vld1.8 {q0-q1}, [r2] ++ veor q0, q0, q11 ++ veor q1, q1, q15 ++ vst1.8 {q0-q1}, [r1] ++ ++ mov sp, ip ++ pop {r4-r6, pc} ++ENDPROC(chacha20_asm_4block_xor_neon) ++ ++ .align 4 ++CTRINC: .word 0, 1, 2, 3 +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/crypto/chacha20-ssse3-x86_64.S 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,734 @@ ++/* ++ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions ++ * ++ * Copyright (C) 2015 Martin Willi ++ * Copyright (C) 2017 Jason A. Donenfeld . All Rights Reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ */ ++ ++#include ++ ++.section .rodata.cst16.ROT8, "aM", @progbits, 16 ++.align 16 ++ROT8:.octa 0x0e0d0c0f0a09080b0605040702010003 ++.section .rodata.cst16.ROT16, "aM", @progbits, 16 ++.align 16 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 ++.section .rodata.cst16.CTRINC, "aM", @progbits, 16 ++.align 16 +CTRINC: .octa 0x00000003000000020000000100000000 ++.section .rodata.cst16.CHACONST, "aM", @progbits, 16 ++.align 16 ++CONST: .ascii "expand 32-byte k" + +.text + @@ -9574,9 +11941,5478 @@ + mov %r11,%rsp + ret +ENDPROC(chacha20_asm_4block_xor_ssse3) ---- /dev/null -+++ b/net/wireguard/crypto/poly1305-avx2-x86_64.S -@@ -0,0 +1,386 @@ ++ ++ENTRY(hchacha20_asm_ssse3) ++ # %rdi: 32 byte output key, o ++ # %rsi: 16 byte nonce, n ++ # %rdx: 32 byte input key, i ++ ++ # x0 = constant ++ movdqa CONST(%rip),%xmm0 ++ # x1, x2 = i ++ movdqu 0x00(%rdx),%xmm1 ++ movdqu 0x10(%rdx),%xmm2 ++ # x3 = n ++ movdqu 0x00(%rsi),%xmm3 ++ ++ movdqa %xmm0,%xmm8 ++ movdqa %xmm1,%xmm9 ++ movdqa %xmm2,%xmm10 ++ movdqa %xmm3,%xmm11 ++ movdqa ROT8(%rip),%xmm4 ++ movdqa ROT16(%rip),%xmm5 ++ ++ mov $10,%ecx ++ ++.Lhdoubleround: ++ ++ # x0 += x1, x3 = rotl32(x3 ^ x0, 16) ++ paddd %xmm1,%xmm0 ++ pxor %xmm0,%xmm3 ++ pshufb %xmm5,%xmm3 ++ ++ # x2 += x3, x1 = rotl32(x1 ^ x2, 12) ++ paddd %xmm3,%xmm2 ++ pxor %xmm2,%xmm1 ++ movdqa %xmm1,%xmm6 ++ pslld $12,%xmm6 ++ psrld $20,%xmm1 ++ por %xmm6,%xmm1 ++ ++ # x0 += x1, x3 = rotl32(x3 ^ x0, 8) ++ paddd %xmm1,%xmm0 ++ pxor %xmm0,%xmm3 ++ pshufb %xmm4,%xmm3 ++ ++ # x2 += x3, x1 = rotl32(x1 ^ x2, 7) ++ paddd %xmm3,%xmm2 ++ pxor %xmm2,%xmm1 ++ movdqa %xmm1,%xmm7 ++ pslld $7,%xmm7 ++ psrld $25,%xmm1 ++ por %xmm7,%xmm1 ++ ++ # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) ++ pshufd $0x39,%xmm1,%xmm1 ++ # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) ++ pshufd $0x4e,%xmm2,%xmm2 ++ # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) ++ pshufd $0x93,%xmm3,%xmm3 ++ ++ # x0 += x1, x3 = rotl32(x3 ^ x0, 16) ++ paddd %xmm1,%xmm0 ++ pxor %xmm0,%xmm3 ++ pshufb %xmm5,%xmm3 ++ ++ # x2 += x3, x1 = rotl32(x1 ^ x2, 12) ++ paddd %xmm3,%xmm2 ++ pxor %xmm2,%xmm1 ++ movdqa %xmm1,%xmm6 ++ pslld $12,%xmm6 ++ psrld $20,%xmm1 ++ por %xmm6,%xmm1 ++ ++ # x0 += x1, x3 = rotl32(x3 ^ x0, 8) ++ paddd %xmm1,%xmm0 ++ pxor %xmm0,%xmm3 ++ pshufb %xmm4,%xmm3 ++ ++ # x2 += x3, x1 = rotl32(x1 ^ x2, 
7) ++ paddd %xmm3,%xmm2 ++ pxor %xmm2,%xmm1 ++ movdqa %xmm1,%xmm7 ++ pslld $7,%xmm7 ++ psrld $25,%xmm1 ++ por %xmm7,%xmm1 ++ ++ # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) ++ pshufd $0x93,%xmm1,%xmm1 ++ # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) ++ pshufd $0x4e,%xmm2,%xmm2 ++ # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) ++ pshufd $0x39,%xmm3,%xmm3 ++ ++ dec %ecx ++ jnz .Lhdoubleround ++ ++ # o0 = x0 ++ movdqu %xmm0,0x00(%rdi) ++ # o1 = x3 ++ movdqu %xmm3,0x10(%rdi) ++ ret ++ENDPROC(hchacha20_asm_ssse3) +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/crypto/curve25519-avx-x86_64.S 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,3259 @@ ++/* ++ * Copyright (C) 2017 Jason A. Donenfeld . All Rights Reserved. ++ * Based on algorithms from Tung Chou ++ */ ++ ++#include ++ ++.data ++.align 16 ++curve25519_sandy2x_v0_0: .quad 0, 0 ++curve25519_sandy2x_v1_0: .quad 1, 0 ++curve25519_sandy2x_v2_1: .quad 2, 1 ++curve25519_sandy2x_v9_0: .quad 9, 0 ++curve25519_sandy2x_v9_9: .quad 9, 9 ++curve25519_sandy2x_v19_19: .quad 19, 19 ++curve25519_sandy2x_v38_1: .quad 38, 1 ++curve25519_sandy2x_v38_38: .quad 38, 38 ++curve25519_sandy2x_v121666_121666: .quad 121666, 121666 ++curve25519_sandy2x_m25: .quad 33554431, 33554431 ++curve25519_sandy2x_m26: .quad 67108863, 67108863 ++curve25519_sandy2x_subc0: .quad 0x07FFFFDA, 0x03FFFFFE ++curve25519_sandy2x_subc2: .quad 0x07FFFFFE, 0x03FFFFFE ++curve25519_sandy2x_REDMASK51: .quad 0x0007FFFFFFFFFFFF ++ ++.text ++.align 32 ++ENTRY(curve25519_sandy2x_fe51_mul) ++ mov %rsp,%r11 ++ and $31,%r11 ++ add $96,%r11 ++ sub %r11,%rsp ++ movq %r11,0(%rsp) ++ movq %r12,8(%rsp) ++ movq %r13,16(%rsp) ++ movq %r14,24(%rsp) ++ movq %r15,32(%rsp) ++ movq %rbx,40(%rsp) ++ movq %rbp,48(%rsp) ++ movq %rdi,56(%rsp) ++ mov %rdx,%rcx ++ movq 24(%rsi),%rdx ++ imulq $19,%rdx,%rax ++ movq %rax,64(%rsp) ++ mulq 16(%rcx) ++ mov %rax,%r8 ++ mov %rdx,%r9 ++ movq 32(%rsi),%rdx ++ imulq $19,%rdx,%rax ++ movq %rax,72(%rsp) ++ mulq 8(%rcx) ++ add %rax,%r8 ++ adc %rdx,%r9 ++ movq 0(%rsi),%rax ++ mulq 0(%rcx) ++ add %rax,%r8 ++ adc %rdx,%r9 ++ movq 0(%rsi),%rax ++ mulq 8(%rcx) ++ mov %rax,%r10 ++ mov %rdx,%r11 ++ movq 0(%rsi),%rax ++ mulq 16(%rcx) ++ mov %rax,%r12 ++ mov %rdx,%r13 ++ movq 0(%rsi),%rax ++ mulq 24(%rcx) ++ mov %rax,%r14 ++ mov %rdx,%r15 ++ movq 0(%rsi),%rax ++ mulq 32(%rcx) ++ mov %rax,%rbx ++ mov %rdx,%rbp ++ movq 8(%rsi),%rax ++ mulq 0(%rcx) ++ add %rax,%r10 ++ adc %rdx,%r11 ++ movq 8(%rsi),%rax ++ mulq 8(%rcx) ++ add %rax,%r12 ++ adc %rdx,%r13 ++ movq 8(%rsi),%rax ++ mulq 16(%rcx) ++ add %rax,%r14 ++ adc %rdx,%r15 ++ movq 8(%rsi),%rax ++ mulq 24(%rcx) ++ add %rax,%rbx ++ adc %rdx,%rbp ++ movq 8(%rsi),%rdx ++ imulq $19,%rdx,%rax ++ mulq 32(%rcx) ++ add %rax,%r8 ++ adc %rdx,%r9 ++ movq 16(%rsi),%rax ++ mulq 0(%rcx) ++ add %rax,%r12 ++ adc %rdx,%r13 ++ movq 16(%rsi),%rax ++ mulq 8(%rcx) ++ add %rax,%r14 ++ adc %rdx,%r15 ++ movq 16(%rsi),%rax ++ mulq 16(%rcx) ++ add %rax,%rbx ++ adc %rdx,%rbp ++ movq 16(%rsi),%rdx ++ imulq $19,%rdx,%rax ++ mulq 24(%rcx) ++ add %rax,%r8 ++ adc %rdx,%r9 ++ movq 16(%rsi),%rdx ++ imulq $19,%rdx,%rax ++ mulq 32(%rcx) ++ add %rax,%r10 ++ adc %rdx,%r11 ++ movq 24(%rsi),%rax ++ mulq 0(%rcx) ++ add %rax,%r14 ++ adc %rdx,%r15 ++ movq 24(%rsi),%rax ++ mulq 8(%rcx) ++ add %rax,%rbx ++ adc %rdx,%rbp ++ movq 64(%rsp),%rax ++ mulq 24(%rcx) ++ add %rax,%r10 ++ adc %rdx,%r11 ++ movq 64(%rsp),%rax ++ mulq 32(%rcx) ++ add %rax,%r12 ++ adc %rdx,%r13 ++ movq 32(%rsi),%rax ++ mulq 0(%rcx) ++ add %rax,%rbx ++ adc %rdx,%rbp ++ movq 72(%rsp),%rax ++ mulq 16(%rcx) ++ add 
%rax,%r10 ++ adc %rdx,%r11 ++ movq 72(%rsp),%rax ++ mulq 24(%rcx) ++ add %rax,%r12 ++ adc %rdx,%r13 ++ movq 72(%rsp),%rax ++ mulq 32(%rcx) ++ add %rax,%r14 ++ adc %rdx,%r15 ++ movq curve25519_sandy2x_REDMASK51(%rip),%rsi ++ shld $13,%r8,%r9 ++ and %rsi,%r8 ++ shld $13,%r10,%r11 ++ and %rsi,%r10 ++ add %r9,%r10 ++ shld $13,%r12,%r13 ++ and %rsi,%r12 ++ add %r11,%r12 ++ shld $13,%r14,%r15 ++ and %rsi,%r14 ++ add %r13,%r14 ++ shld $13,%rbx,%rbp ++ and %rsi,%rbx ++ add %r15,%rbx ++ imulq $19,%rbp,%rdx ++ add %rdx,%r8 ++ mov %r8,%rdx ++ shr $51,%rdx ++ add %r10,%rdx ++ mov %rdx,%rcx ++ shr $51,%rdx ++ and %rsi,%r8 ++ add %r12,%rdx ++ mov %rdx,%r9 ++ shr $51,%rdx ++ and %rsi,%rcx ++ add %r14,%rdx ++ mov %rdx,%rax ++ shr $51,%rdx ++ and %rsi,%r9 ++ add %rbx,%rdx ++ mov %rdx,%r10 ++ shr $51,%rdx ++ and %rsi,%rax ++ imulq $19,%rdx,%rdx ++ add %rdx,%r8 ++ and %rsi,%r10 ++ movq %r8,0(%rdi) ++ movq %rcx,8(%rdi) ++ movq %r9,16(%rdi) ++ movq %rax,24(%rdi) ++ movq %r10,32(%rdi) ++ movq 0(%rsp),%r11 ++ movq 8(%rsp),%r12 ++ movq 16(%rsp),%r13 ++ movq 24(%rsp),%r14 ++ movq 32(%rsp),%r15 ++ movq 40(%rsp),%rbx ++ movq 48(%rsp),%rbp ++ add %r11,%rsp ++ mov %rdi,%rax ++ mov %rsi,%rdx ++ ret ++ENDPROC(curve25519_sandy2x_fe51_mul) ++ ++.align 32 ++ENTRY(curve25519_sandy2x_fe51_nsquare) ++ mov %rsp,%r11 ++ and $31,%r11 ++ add $64,%r11 ++ sub %r11,%rsp ++ movq %r11,0(%rsp) ++ movq %r12,8(%rsp) ++ movq %r13,16(%rsp) ++ movq %r14,24(%rsp) ++ movq %r15,32(%rsp) ++ movq %rbx,40(%rsp) ++ movq %rbp,48(%rsp) ++ movq 0(%rsi),%rcx ++ movq 8(%rsi),%r8 ++ movq 16(%rsi),%r9 ++ movq 24(%rsi),%rax ++ movq 32(%rsi),%rsi ++ movq %r9,16(%rdi) ++ movq %rax,24(%rdi) ++ movq %rsi,32(%rdi) ++ mov %rdx,%rsi ++ ++ .align 16 ++ .Lloop: ++ sub $1,%rsi ++ mov %rcx,%rax ++ mul %rcx ++ add %rcx,%rcx ++ mov %rax,%r9 ++ mov %rdx,%r10 ++ mov %rcx,%rax ++ mul %r8 ++ mov %rax,%r11 ++ mov %rdx,%r12 ++ mov %rcx,%rax ++ mulq 16(%rdi) ++ mov %rax,%r13 ++ mov %rdx,%r14 ++ mov %rcx,%rax ++ mulq 24(%rdi) ++ mov %rax,%r15 ++ mov %rdx,%rbx ++ mov %rcx,%rax ++ mulq 32(%rdi) ++ mov %rax,%rcx ++ mov %rdx,%rbp ++ mov %r8,%rax ++ mul %r8 ++ add %r8,%r8 ++ add %rax,%r13 ++ adc %rdx,%r14 ++ mov %r8,%rax ++ mulq 16(%rdi) ++ add %rax,%r15 ++ adc %rdx,%rbx ++ mov %r8,%rax ++ imulq $19, %r8,%r8 ++ mulq 24(%rdi) ++ add %rax,%rcx ++ adc %rdx,%rbp ++ mov %r8,%rax ++ mulq 32(%rdi) ++ add %rax,%r9 ++ adc %rdx,%r10 ++ movq 16(%rdi),%rax ++ mulq 16(%rdi) ++ add %rax,%rcx ++ adc %rdx,%rbp ++ shld $13,%rcx,%rbp ++ movq 16(%rdi),%rax ++ imulq $38, %rax,%rax ++ mulq 24(%rdi) ++ add %rax,%r9 ++ adc %rdx,%r10 ++ shld $13,%r9,%r10 ++ movq 16(%rdi),%rax ++ imulq $38, %rax,%rax ++ mulq 32(%rdi) ++ add %rax,%r11 ++ adc %rdx,%r12 ++ movq 24(%rdi),%rax ++ imulq $19, %rax,%rax ++ mulq 24(%rdi) ++ add %rax,%r11 ++ adc %rdx,%r12 ++ shld $13,%r11,%r12 ++ movq 24(%rdi),%rax ++ imulq $38, %rax,%rax ++ mulq 32(%rdi) ++ add %rax,%r13 ++ adc %rdx,%r14 ++ shld $13,%r13,%r14 ++ movq 32(%rdi),%rax ++ imulq $19, %rax,%rax ++ mulq 32(%rdi) ++ add %rax,%r15 ++ adc %rdx,%rbx ++ shld $13,%r15,%rbx ++ movq curve25519_sandy2x_REDMASK51(%rip),%rdx ++ and %rdx,%rcx ++ add %rbx,%rcx ++ and %rdx,%r9 ++ and %rdx,%r11 ++ add %r10,%r11 ++ and %rdx,%r13 ++ add %r12,%r13 ++ and %rdx,%r15 ++ add %r14,%r15 ++ imulq $19, %rbp,%rbp ++ lea (%r9,%rbp),%r9 ++ mov %r9,%rax ++ shr $51,%r9 ++ add %r11,%r9 ++ and %rdx,%rax ++ mov %r9,%r8 ++ shr $51,%r9 ++ add %r13,%r9 ++ and %rdx,%r8 ++ mov %r9,%r10 ++ shr $51,%r9 ++ add %r15,%r9 ++ and %rdx,%r10 ++ movq %r10,16(%rdi) ++ mov %r9,%r10 ++ shr $51,%r9 ++ add %rcx,%r9 ++ and 
%rdx,%r10 ++ movq %r10,24(%rdi) ++ mov %r9,%r10 ++ shr $51,%r9 ++ imulq $19, %r9,%r9 ++ lea (%rax,%r9),%rcx ++ and %rdx,%r10 ++ movq %r10,32(%rdi) ++ cmp $0,%rsi ++ jne .Lloop ++ ++ movq %rcx,0(%rdi) ++ movq %r8,8(%rdi) ++ movq 0(%rsp),%r11 ++ movq 8(%rsp),%r12 ++ movq 16(%rsp),%r13 ++ movq 24(%rsp),%r14 ++ movq 32(%rsp),%r15 ++ movq 40(%rsp),%rbx ++ movq 48(%rsp),%rbp ++ add %r11,%rsp ++ ret ++ENDPROC(curve25519_sandy2x_fe51_nsquare) ++ ++.align 32 ++ENTRY(curve25519_sandy2x_fe51_pack) ++ mov %rsp,%r11 ++ and $31,%r11 ++ add $32,%r11 ++ sub %r11,%rsp ++ movq %r11,0(%rsp) ++ movq %r12,8(%rsp) ++ movq 0(%rsi),%rdx ++ movq 8(%rsi),%rcx ++ movq 16(%rsi),%r8 ++ movq 24(%rsi),%r9 ++ movq 32(%rsi),%rsi ++ movq curve25519_sandy2x_REDMASK51(%rip),%rax ++ lea -18(%rax),%r10 ++ mov $3,%r11 ++ ++ .align 16 ++ .Lreduceloop: ++ mov %rdx,%r12 ++ shr $51,%r12 ++ and %rax,%rdx ++ add %r12,%rcx ++ mov %rcx,%r12 ++ shr $51,%r12 ++ and %rax,%rcx ++ add %r12,%r8 ++ mov %r8,%r12 ++ shr $51,%r12 ++ and %rax,%r8 ++ add %r12,%r9 ++ mov %r9,%r12 ++ shr $51,%r12 ++ and %rax,%r9 ++ add %r12,%rsi ++ mov %rsi,%r12 ++ shr $51,%r12 ++ and %rax,%rsi ++ imulq $19, %r12,%r12 ++ add %r12,%rdx ++ sub $1,%r11 ++ ja .Lreduceloop ++ ++ mov $1,%r12 ++ cmp %r10,%rdx ++ cmovl %r11,%r12 ++ cmp %rax,%rcx ++ cmovne %r11,%r12 ++ cmp %rax,%r8 ++ cmovne %r11,%r12 ++ cmp %rax,%r9 ++ cmovne %r11,%r12 ++ cmp %rax,%rsi ++ cmovne %r11,%r12 ++ neg %r12 ++ and %r12,%rax ++ and %r12,%r10 ++ sub %r10,%rdx ++ sub %rax,%rcx ++ sub %rax,%r8 ++ sub %rax,%r9 ++ sub %rax,%rsi ++ mov %rdx,%rax ++ and $0xFF,%eax ++ movb %al,0(%rdi) ++ mov %rdx,%rax ++ shr $8,%rax ++ and $0xFF,%eax ++ movb %al,1(%rdi) ++ mov %rdx,%rax ++ shr $16,%rax ++ and $0xFF,%eax ++ movb %al,2(%rdi) ++ mov %rdx,%rax ++ shr $24,%rax ++ and $0xFF,%eax ++ movb %al,3(%rdi) ++ mov %rdx,%rax ++ shr $32,%rax ++ and $0xFF,%eax ++ movb %al,4(%rdi) ++ mov %rdx,%rax ++ shr $40,%rax ++ and $0xFF,%eax ++ movb %al,5(%rdi) ++ mov %rdx,%rdx ++ shr $48,%rdx ++ mov %rcx,%rax ++ shl $3,%rax ++ and $0xF8,%eax ++ xor %rdx,%rax ++ movb %al,6(%rdi) ++ mov %rcx,%rdx ++ shr $5,%rdx ++ and $0xFF,%edx ++ movb %dl,7(%rdi) ++ mov %rcx,%rdx ++ shr $13,%rdx ++ and $0xFF,%edx ++ movb %dl,8(%rdi) ++ mov %rcx,%rdx ++ shr $21,%rdx ++ and $0xFF,%edx ++ movb %dl,9(%rdi) ++ mov %rcx,%rdx ++ shr $29,%rdx ++ and $0xFF,%edx ++ movb %dl,10(%rdi) ++ mov %rcx,%rdx ++ shr $37,%rdx ++ and $0xFF,%edx ++ movb %dl,11(%rdi) ++ mov %rcx,%rdx ++ shr $45,%rdx ++ mov %r8,%rcx ++ shl $6,%rcx ++ and $0xC0,%ecx ++ xor %rdx,%rcx ++ movb %cl,12(%rdi) ++ mov %r8,%rdx ++ shr $2,%rdx ++ and $0xFF,%edx ++ movb %dl,13(%rdi) ++ mov %r8,%rdx ++ shr $10,%rdx ++ and $0xFF,%edx ++ movb %dl,14(%rdi) ++ mov %r8,%rdx ++ shr $18,%rdx ++ and $0xFF,%edx ++ movb %dl,15(%rdi) ++ mov %r8,%rdx ++ shr $26,%rdx ++ and $0xFF,%edx ++ movb %dl,16(%rdi) ++ mov %r8,%rdx ++ shr $34,%rdx ++ and $0xFF,%edx ++ movb %dl,17(%rdi) ++ mov %r8,%rdx ++ shr $42,%rdx ++ movb %dl,18(%rdi) ++ mov %r8,%rdx ++ shr $50,%rdx ++ mov %r9,%rcx ++ shl $1,%rcx ++ and $0xFE,%ecx ++ xor %rdx,%rcx ++ movb %cl,19(%rdi) ++ mov %r9,%rdx ++ shr $7,%rdx ++ and $0xFF,%edx ++ movb %dl,20(%rdi) ++ mov %r9,%rdx ++ shr $15,%rdx ++ and $0xFF,%edx ++ movb %dl,21(%rdi) ++ mov %r9,%rdx ++ shr $23,%rdx ++ and $0xFF,%edx ++ movb %dl,22(%rdi) ++ mov %r9,%rdx ++ shr $31,%rdx ++ and $0xFF,%edx ++ movb %dl,23(%rdi) ++ mov %r9,%rdx ++ shr $39,%rdx ++ and $0xFF,%edx ++ movb %dl,24(%rdi) ++ mov %r9,%rdx ++ shr $47,%rdx ++ mov %rsi,%rcx ++ shl $4,%rcx ++ and $0xF0,%ecx ++ xor %rdx,%rcx ++ movb %cl,25(%rdi) ++ mov 
%rsi,%rdx ++ shr $4,%rdx ++ and $0xFF,%edx ++ movb %dl,26(%rdi) ++ mov %rsi,%rdx ++ shr $12,%rdx ++ and $0xFF,%edx ++ movb %dl,27(%rdi) ++ mov %rsi,%rdx ++ shr $20,%rdx ++ and $0xFF,%edx ++ movb %dl,28(%rdi) ++ mov %rsi,%rdx ++ shr $28,%rdx ++ and $0xFF,%edx ++ movb %dl,29(%rdi) ++ mov %rsi,%rdx ++ shr $36,%rdx ++ and $0xFF,%edx ++ movb %dl,30(%rdi) ++ mov %rsi,%rsi ++ shr $44,%rsi ++ movb %sil,31(%rdi) ++ movq 0(%rsp),%r11 ++ movq 8(%rsp),%r12 ++ add %r11,%rsp ++ ret ++ENDPROC(curve25519_sandy2x_fe51_pack) ++ ++.align 32 ++ENTRY(curve25519_sandy2x_ladder) ++ mov %rsp,%r11 ++ and $31,%r11 ++ add $1856,%r11 ++ sub %r11,%rsp ++ movq %r11,1824(%rsp) ++ movq %r12,1832(%rsp) ++ movq %r13,1840(%rsp) ++ movq %r14,1848(%rsp) ++ vmovdqa curve25519_sandy2x_v0_0(%rip),%xmm0 ++ vmovdqa curve25519_sandy2x_v1_0(%rip),%xmm1 ++ vmovdqu 0(%rdi),%xmm2 ++ vmovdqa %xmm2,0(%rsp) ++ vmovdqu 16(%rdi),%xmm2 ++ vmovdqa %xmm2,16(%rsp) ++ vmovdqu 32(%rdi),%xmm2 ++ vmovdqa %xmm2,32(%rsp) ++ vmovdqu 48(%rdi),%xmm2 ++ vmovdqa %xmm2,48(%rsp) ++ vmovdqu 64(%rdi),%xmm2 ++ vmovdqa %xmm2,64(%rsp) ++ vmovdqa %xmm1,80(%rsp) ++ vmovdqa %xmm0,96(%rsp) ++ vmovdqa %xmm0,112(%rsp) ++ vmovdqa %xmm0,128(%rsp) ++ vmovdqa %xmm0,144(%rsp) ++ vmovdqa %xmm1,%xmm0 ++ vpxor %xmm1,%xmm1,%xmm1 ++ vpxor %xmm2,%xmm2,%xmm2 ++ vpxor %xmm3,%xmm3,%xmm3 ++ vpxor %xmm4,%xmm4,%xmm4 ++ vpxor %xmm5,%xmm5,%xmm5 ++ vpxor %xmm6,%xmm6,%xmm6 ++ vpxor %xmm7,%xmm7,%xmm7 ++ vpxor %xmm8,%xmm8,%xmm8 ++ vpxor %xmm9,%xmm9,%xmm9 ++ vmovdqu 0(%rdi),%xmm10 ++ vmovdqa %xmm10,160(%rsp) ++ vmovdqu 16(%rdi),%xmm10 ++ vmovdqa %xmm10,176(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 ++ vmovdqa %xmm10,192(%rsp) ++ vmovdqu 32(%rdi),%xmm10 ++ vmovdqa %xmm10,208(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 ++ vmovdqa %xmm10,224(%rsp) ++ vmovdqu 48(%rdi),%xmm10 ++ vmovdqa %xmm10,240(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 ++ vmovdqa %xmm10,256(%rsp) ++ vmovdqu 64(%rdi),%xmm10 ++ vmovdqa %xmm10,272(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 ++ vmovdqa %xmm10,288(%rsp) ++ vmovdqu 8(%rdi),%xmm10 ++ vpmuludq curve25519_sandy2x_v2_1(%rip),%xmm10,%xmm10 ++ vmovdqa %xmm10,304(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 ++ vmovdqa %xmm10,320(%rsp) ++ vmovdqu 24(%rdi),%xmm10 ++ vpmuludq curve25519_sandy2x_v2_1(%rip),%xmm10,%xmm10 ++ vmovdqa %xmm10,336(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 ++ vmovdqa %xmm10,352(%rsp) ++ vmovdqu 40(%rdi),%xmm10 ++ vpmuludq curve25519_sandy2x_v2_1(%rip),%xmm10,%xmm10 ++ vmovdqa %xmm10,368(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 ++ vmovdqa %xmm10,384(%rsp) ++ vmovdqu 56(%rdi),%xmm10 ++ vpmuludq curve25519_sandy2x_v2_1(%rip),%xmm10,%xmm10 ++ vmovdqa %xmm10,400(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 ++ vmovdqa %xmm10,416(%rsp) ++ vmovdqu 0(%rdi),%xmm10 ++ vmovdqu 64(%rdi),%xmm11 ++ vblendps $12, %xmm11, %xmm10, %xmm10 ++ vpshufd $2,%xmm10,%xmm10 ++ vpmuludq curve25519_sandy2x_v38_1(%rip),%xmm10,%xmm10 ++ vmovdqa %xmm10,432(%rsp) ++ movq 0(%rsi),%rdx ++ movq 8(%rsi),%rcx ++ movq 16(%rsi),%r8 ++ movq 24(%rsi),%r9 ++ shrd $1,%rcx,%rdx ++ shrd $1,%r8,%rcx ++ shrd $1,%r9,%r8 ++ shr $1,%r9 ++ xorq 0(%rsi),%rdx ++ xorq 8(%rsi),%rcx ++ xorq 16(%rsi),%r8 ++ xorq 24(%rsi),%r9 ++ leaq 800(%rsp),%rsi ++ mov $64,%rax ++ ++ .align 16 ++ .Lladder_small_loop: ++ mov %rdx,%r10 ++ mov %rcx,%r11 ++ mov %r8,%r12 ++ mov %r9,%r13 ++ shr $1,%rdx ++ shr $1,%rcx ++ shr $1,%r8 ++ shr $1,%r9 ++ and $1,%r10d ++ 
and $1,%r11d ++ and $1,%r12d ++ and $1,%r13d ++ neg %r10 ++ neg %r11 ++ neg %r12 ++ neg %r13 ++ movl %r10d,0(%rsi) ++ movl %r11d,256(%rsi) ++ movl %r12d,512(%rsi) ++ movl %r13d,768(%rsi) ++ add $4,%rsi ++ sub $1,%rax ++ jne .Lladder_small_loop ++ mov $255,%rdx ++ add $760,%rsi ++ ++ .align 16 ++ .Lladder_loop: ++ sub $1,%rdx ++ vbroadcastss 0(%rsi),%xmm10 ++ sub $4,%rsi ++ vmovdqa 0(%rsp),%xmm11 ++ vmovdqa 80(%rsp),%xmm12 ++ vpxor %xmm11,%xmm0,%xmm13 ++ vpand %xmm10,%xmm13,%xmm13 ++ vpxor %xmm13,%xmm0,%xmm0 ++ vpxor %xmm13,%xmm11,%xmm11 ++ vpxor %xmm12,%xmm1,%xmm13 ++ vpand %xmm10,%xmm13,%xmm13 ++ vpxor %xmm13,%xmm1,%xmm1 ++ vpxor %xmm13,%xmm12,%xmm12 ++ vmovdqa 16(%rsp),%xmm13 ++ vmovdqa 96(%rsp),%xmm14 ++ vpxor %xmm13,%xmm2,%xmm15 ++ vpand %xmm10,%xmm15,%xmm15 ++ vpxor %xmm15,%xmm2,%xmm2 ++ vpxor %xmm15,%xmm13,%xmm13 ++ vpxor %xmm14,%xmm3,%xmm15 ++ vpand %xmm10,%xmm15,%xmm15 ++ vpxor %xmm15,%xmm3,%xmm3 ++ vpxor %xmm15,%xmm14,%xmm14 ++ vmovdqa %xmm13,0(%rsp) ++ vmovdqa %xmm14,16(%rsp) ++ vmovdqa 32(%rsp),%xmm13 ++ vmovdqa 112(%rsp),%xmm14 ++ vpxor %xmm13,%xmm4,%xmm15 ++ vpand %xmm10,%xmm15,%xmm15 ++ vpxor %xmm15,%xmm4,%xmm4 ++ vpxor %xmm15,%xmm13,%xmm13 ++ vpxor %xmm14,%xmm5,%xmm15 ++ vpand %xmm10,%xmm15,%xmm15 ++ vpxor %xmm15,%xmm5,%xmm5 ++ vpxor %xmm15,%xmm14,%xmm14 ++ vmovdqa %xmm13,32(%rsp) ++ vmovdqa %xmm14,80(%rsp) ++ vmovdqa 48(%rsp),%xmm13 ++ vmovdqa 128(%rsp),%xmm14 ++ vpxor %xmm13,%xmm6,%xmm15 ++ vpand %xmm10,%xmm15,%xmm15 ++ vpxor %xmm15,%xmm6,%xmm6 ++ vpxor %xmm15,%xmm13,%xmm13 ++ vpxor %xmm14,%xmm7,%xmm15 ++ vpand %xmm10,%xmm15,%xmm15 ++ vpxor %xmm15,%xmm7,%xmm7 ++ vpxor %xmm15,%xmm14,%xmm14 ++ vmovdqa %xmm13,48(%rsp) ++ vmovdqa %xmm14,96(%rsp) ++ vmovdqa 64(%rsp),%xmm13 ++ vmovdqa 144(%rsp),%xmm14 ++ vpxor %xmm13,%xmm8,%xmm15 ++ vpand %xmm10,%xmm15,%xmm15 ++ vpxor %xmm15,%xmm8,%xmm8 ++ vpxor %xmm15,%xmm13,%xmm13 ++ vpxor %xmm14,%xmm9,%xmm15 ++ vpand %xmm10,%xmm15,%xmm15 ++ vpxor %xmm15,%xmm9,%xmm9 ++ vpxor %xmm15,%xmm14,%xmm14 ++ vmovdqa %xmm13,64(%rsp) ++ vmovdqa %xmm14,112(%rsp) ++ vpaddq curve25519_sandy2x_subc0(%rip),%xmm11,%xmm10 ++ vpsubq %xmm12,%xmm10,%xmm10 ++ vpaddq %xmm12,%xmm11,%xmm11 ++ vpunpckhqdq %xmm10,%xmm11,%xmm12 ++ vpunpcklqdq %xmm10,%xmm11,%xmm10 ++ vpaddq %xmm1,%xmm0,%xmm11 ++ vpaddq curve25519_sandy2x_subc0(%rip),%xmm0,%xmm0 ++ vpsubq %xmm1,%xmm0,%xmm0 ++ vpunpckhqdq %xmm11,%xmm0,%xmm1 ++ vpunpcklqdq %xmm11,%xmm0,%xmm0 ++ vpmuludq %xmm0,%xmm10,%xmm11 ++ vpmuludq %xmm1,%xmm10,%xmm13 ++ vmovdqa %xmm1,128(%rsp) ++ vpaddq %xmm1,%xmm1,%xmm1 ++ vpmuludq %xmm0,%xmm12,%xmm14 ++ vmovdqa %xmm0,144(%rsp) ++ vpaddq %xmm14,%xmm13,%xmm13 ++ vpmuludq %xmm1,%xmm12,%xmm0 ++ vmovdqa %xmm1,448(%rsp) ++ vpaddq %xmm3,%xmm2,%xmm1 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm2,%xmm2 ++ vpsubq %xmm3,%xmm2,%xmm2 ++ vpunpckhqdq %xmm1,%xmm2,%xmm3 ++ vpunpcklqdq %xmm1,%xmm2,%xmm1 ++ vpmuludq %xmm1,%xmm10,%xmm2 ++ vpaddq %xmm2,%xmm0,%xmm0 ++ vpmuludq %xmm3,%xmm10,%xmm2 ++ vmovdqa %xmm3,464(%rsp) ++ vpaddq %xmm3,%xmm3,%xmm3 ++ vpmuludq %xmm1,%xmm12,%xmm14 ++ vmovdqa %xmm1,480(%rsp) ++ vpaddq %xmm14,%xmm2,%xmm2 ++ vpmuludq %xmm3,%xmm12,%xmm1 ++ vmovdqa %xmm3,496(%rsp) ++ vpaddq %xmm5,%xmm4,%xmm3 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm4,%xmm4 ++ vpsubq %xmm5,%xmm4,%xmm4 ++ vpunpckhqdq %xmm3,%xmm4,%xmm5 ++ vpunpcklqdq %xmm3,%xmm4,%xmm3 ++ vpmuludq %xmm3,%xmm10,%xmm4 ++ vpaddq %xmm4,%xmm1,%xmm1 ++ vpmuludq %xmm5,%xmm10,%xmm4 ++ vmovdqa %xmm5,512(%rsp) ++ vpaddq %xmm5,%xmm5,%xmm5 ++ vpmuludq %xmm3,%xmm12,%xmm14 ++ vmovdqa %xmm3,528(%rsp) ++ vpaddq %xmm14,%xmm4,%xmm4 ++ vpaddq 
%xmm7,%xmm6,%xmm3 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm6,%xmm6 ++ vpsubq %xmm7,%xmm6,%xmm6 ++ vpunpckhqdq %xmm3,%xmm6,%xmm7 ++ vpunpcklqdq %xmm3,%xmm6,%xmm3 ++ vpmuludq %xmm3,%xmm10,%xmm6 ++ vpmuludq %xmm5,%xmm12,%xmm14 ++ vmovdqa %xmm5,544(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm5,%xmm5 ++ vmovdqa %xmm5,560(%rsp) ++ vpaddq %xmm14,%xmm6,%xmm6 ++ vpmuludq %xmm7,%xmm10,%xmm5 ++ vmovdqa %xmm7,576(%rsp) ++ vpaddq %xmm7,%xmm7,%xmm7 ++ vpmuludq %xmm3,%xmm12,%xmm14 ++ vmovdqa %xmm3,592(%rsp) ++ vpaddq %xmm14,%xmm5,%xmm5 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 ++ vmovdqa %xmm3,608(%rsp) ++ vpaddq %xmm9,%xmm8,%xmm3 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm8,%xmm8 ++ vpsubq %xmm9,%xmm8,%xmm8 ++ vpunpckhqdq %xmm3,%xmm8,%xmm9 ++ vpunpcklqdq %xmm3,%xmm8,%xmm3 ++ vmovdqa %xmm3,624(%rsp) ++ vpmuludq %xmm7,%xmm12,%xmm8 ++ vmovdqa %xmm7,640(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm7,%xmm7 ++ vmovdqa %xmm7,656(%rsp) ++ vpmuludq %xmm3,%xmm10,%xmm7 ++ vpaddq %xmm7,%xmm8,%xmm8 ++ vpmuludq %xmm9,%xmm10,%xmm7 ++ vmovdqa %xmm9,672(%rsp) ++ vpaddq %xmm9,%xmm9,%xmm9 ++ vpmuludq %xmm3,%xmm12,%xmm10 ++ vpaddq %xmm10,%xmm7,%xmm7 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 ++ vmovdqa %xmm3,688(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm12,%xmm12 ++ vpmuludq %xmm9,%xmm12,%xmm3 ++ vmovdqa %xmm9,704(%rsp) ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vmovdqa 0(%rsp),%xmm3 ++ vmovdqa 16(%rsp),%xmm9 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 ++ vpsubq %xmm9,%xmm10,%xmm10 ++ vpaddq %xmm9,%xmm3,%xmm3 ++ vpunpckhqdq %xmm10,%xmm3,%xmm9 ++ vpunpcklqdq %xmm10,%xmm3,%xmm3 ++ vpmuludq 144(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm0,%xmm0 ++ vpmuludq 128(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm2,%xmm2 ++ vpmuludq 480(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm1,%xmm1 ++ vpmuludq 464(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm4,%xmm4 ++ vpmuludq 528(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm6,%xmm6 ++ vpmuludq 512(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm5,%xmm5 ++ vpmuludq 592(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm8,%xmm8 ++ vpmuludq 576(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm7,%xmm7 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 ++ vpmuludq 624(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm11,%xmm11 ++ vpmuludq 672(%rsp),%xmm3,%xmm3 ++ vpaddq %xmm3,%xmm13,%xmm13 ++ vpmuludq 144(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm2,%xmm2 ++ vpmuludq 448(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm1,%xmm1 ++ vpmuludq 480(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm4,%xmm4 ++ vpmuludq 496(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm6,%xmm6 ++ vpmuludq 528(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm5,%xmm5 ++ vpmuludq 544(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm8,%xmm8 ++ vpmuludq 592(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm7,%xmm7 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 ++ vpmuludq 640(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vpmuludq 624(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm13,%xmm13 ++ vpmuludq 704(%rsp),%xmm9,%xmm9 ++ vpaddq %xmm9,%xmm0,%xmm0 ++ vmovdqa 32(%rsp),%xmm3 ++ vmovdqa 80(%rsp),%xmm9 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 ++ vpsubq %xmm9,%xmm10,%xmm10 ++ vpaddq %xmm9,%xmm3,%xmm3 ++ vpunpckhqdq %xmm10,%xmm3,%xmm9 ++ vpunpcklqdq %xmm10,%xmm3,%xmm3 ++ vpmuludq 144(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm1,%xmm1 ++ vpmuludq 128(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm4,%xmm4 ++ vpmuludq 480(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm6,%xmm6 ++ vpmuludq 464(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm5,%xmm5 ++ vpmuludq 528(%rsp),%xmm3,%xmm10 ++ vpaddq 
%xmm10,%xmm8,%xmm8 ++ vpmuludq 512(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm7,%xmm7 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 ++ vpmuludq 592(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm11,%xmm11 ++ vpmuludq 576(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm13,%xmm13 ++ vpmuludq 624(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm0,%xmm0 ++ vpmuludq 672(%rsp),%xmm3,%xmm3 ++ vpaddq %xmm3,%xmm2,%xmm2 ++ vpmuludq 144(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm4,%xmm4 ++ vpmuludq 448(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm6,%xmm6 ++ vpmuludq 480(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm5,%xmm5 ++ vpmuludq 496(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm8,%xmm8 ++ vpmuludq 528(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm7,%xmm7 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 ++ vpmuludq 544(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vpmuludq 592(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm13,%xmm13 ++ vpmuludq 640(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm0,%xmm0 ++ vpmuludq 624(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm2,%xmm2 ++ vpmuludq 704(%rsp),%xmm9,%xmm9 ++ vpaddq %xmm9,%xmm1,%xmm1 ++ vmovdqa 48(%rsp),%xmm3 ++ vmovdqa 96(%rsp),%xmm9 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 ++ vpsubq %xmm9,%xmm10,%xmm10 ++ vpaddq %xmm9,%xmm3,%xmm3 ++ vpunpckhqdq %xmm10,%xmm3,%xmm9 ++ vpunpcklqdq %xmm10,%xmm3,%xmm3 ++ vpmuludq 144(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm6,%xmm6 ++ vpmuludq 128(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm5,%xmm5 ++ vpmuludq 480(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm8,%xmm8 ++ vpmuludq 464(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm7,%xmm7 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 ++ vpmuludq 528(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm11,%xmm11 ++ vpmuludq 512(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm13,%xmm13 ++ vpmuludq 592(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm0,%xmm0 ++ vpmuludq 576(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm2,%xmm2 ++ vpmuludq 624(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm1,%xmm1 ++ vpmuludq 672(%rsp),%xmm3,%xmm3 ++ vpaddq %xmm3,%xmm4,%xmm4 ++ vpmuludq 144(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm5,%xmm5 ++ vpmuludq 448(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm8,%xmm8 ++ vpmuludq 480(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm7,%xmm7 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 ++ vpmuludq 496(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vpmuludq 528(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm13,%xmm13 ++ vpmuludq 544(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm0,%xmm0 ++ vpmuludq 592(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm2,%xmm2 ++ vpmuludq 640(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm1,%xmm1 ++ vpmuludq 624(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm4,%xmm4 ++ vpmuludq 704(%rsp),%xmm9,%xmm9 ++ vpaddq %xmm9,%xmm6,%xmm6 ++ vmovdqa 64(%rsp),%xmm3 ++ vmovdqa 112(%rsp),%xmm9 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 ++ vpsubq %xmm9,%xmm10,%xmm10 ++ vpaddq %xmm9,%xmm3,%xmm3 ++ vpunpckhqdq %xmm10,%xmm3,%xmm9 ++ vpunpcklqdq %xmm10,%xmm3,%xmm3 ++ vpmuludq 144(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm8,%xmm8 ++ vpmuludq 128(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm7,%xmm7 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 ++ vpmuludq 480(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm11,%xmm11 ++ vpmuludq 464(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm13,%xmm13 ++ vpmuludq 528(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm0,%xmm0 ++ vpmuludq 512(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm2,%xmm2 ++ vpmuludq 592(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm1,%xmm1 ++ vpmuludq 576(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm4,%xmm4 ++ vpmuludq 624(%rsp),%xmm3,%xmm10 ++ vpaddq 
%xmm10,%xmm6,%xmm6 ++ vpmuludq 672(%rsp),%xmm3,%xmm3 ++ vpaddq %xmm3,%xmm5,%xmm5 ++ vpmuludq 144(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm7,%xmm7 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 ++ vpmuludq 448(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vpmuludq 480(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm13,%xmm13 ++ vpmuludq 496(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm0,%xmm0 ++ vpmuludq 528(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm2,%xmm2 ++ vpmuludq 544(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm1,%xmm1 ++ vpmuludq 592(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm4,%xmm4 ++ vpmuludq 640(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm6,%xmm6 ++ vpmuludq 624(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm5,%xmm5 ++ vpmuludq 704(%rsp),%xmm9,%xmm9 ++ vpaddq %xmm9,%xmm8,%xmm8 ++ vpsrlq $25,%xmm4,%xmm3 ++ vpaddq %xmm3,%xmm6,%xmm6 ++ vpand curve25519_sandy2x_m25(%rip),%xmm4,%xmm4 ++ vpsrlq $26,%xmm11,%xmm3 ++ vpaddq %xmm3,%xmm13,%xmm13 ++ vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 ++ vpsrlq $26,%xmm6,%xmm3 ++ vpaddq %xmm3,%xmm5,%xmm5 ++ vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6 ++ vpsrlq $25,%xmm13,%xmm3 ++ vpaddq %xmm3,%xmm0,%xmm0 ++ vpand curve25519_sandy2x_m25(%rip),%xmm13,%xmm13 ++ vpsrlq $25,%xmm5,%xmm3 ++ vpaddq %xmm3,%xmm8,%xmm8 ++ vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5 ++ vpsrlq $26,%xmm0,%xmm3 ++ vpaddq %xmm3,%xmm2,%xmm2 ++ vpand curve25519_sandy2x_m26(%rip),%xmm0,%xmm0 ++ vpsrlq $26,%xmm8,%xmm3 ++ vpaddq %xmm3,%xmm7,%xmm7 ++ vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8 ++ vpsrlq $25,%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm1,%xmm1 ++ vpand curve25519_sandy2x_m25(%rip),%xmm2,%xmm2 ++ vpsrlq $25,%xmm7,%xmm3 ++ vpsllq $4,%xmm3,%xmm9 ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vpsllq $1,%xmm3,%xmm3 ++ vpaddq %xmm3,%xmm9,%xmm9 ++ vpaddq %xmm9,%xmm11,%xmm11 ++ vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7 ++ vpsrlq $26,%xmm1,%xmm3 ++ vpaddq %xmm3,%xmm4,%xmm4 ++ vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1 ++ vpsrlq $26,%xmm11,%xmm3 ++ vpaddq %xmm3,%xmm13,%xmm13 ++ vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 ++ vpsrlq $25,%xmm4,%xmm3 ++ vpaddq %xmm3,%xmm6,%xmm6 ++ vpand curve25519_sandy2x_m25(%rip),%xmm4,%xmm4 ++ vpunpcklqdq %xmm13,%xmm11,%xmm3 ++ vpunpckhqdq %xmm13,%xmm11,%xmm9 ++ vpaddq curve25519_sandy2x_subc0(%rip),%xmm9,%xmm10 ++ vpsubq %xmm3,%xmm10,%xmm10 ++ vpaddq %xmm9,%xmm3,%xmm3 ++ vpunpckhqdq %xmm3,%xmm10,%xmm9 ++ vpunpcklqdq %xmm3,%xmm10,%xmm10 ++ vpmuludq %xmm10,%xmm10,%xmm3 ++ vpaddq %xmm10,%xmm10,%xmm10 ++ vpmuludq %xmm9,%xmm10,%xmm11 ++ vpunpcklqdq %xmm2,%xmm0,%xmm12 ++ vpunpckhqdq %xmm2,%xmm0,%xmm0 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm0,%xmm2 ++ vpsubq %xmm12,%xmm2,%xmm2 ++ vpaddq %xmm0,%xmm12,%xmm12 ++ vpunpckhqdq %xmm12,%xmm2,%xmm0 ++ vpunpcklqdq %xmm12,%xmm2,%xmm2 ++ vpmuludq %xmm2,%xmm10,%xmm12 ++ vpaddq %xmm9,%xmm9,%xmm13 ++ vpmuludq %xmm13,%xmm9,%xmm9 ++ vpaddq %xmm9,%xmm12,%xmm12 ++ vpmuludq %xmm0,%xmm10,%xmm9 ++ vpmuludq %xmm2,%xmm13,%xmm14 ++ vpaddq %xmm14,%xmm9,%xmm9 ++ vpunpcklqdq %xmm4,%xmm1,%xmm14 ++ vpunpckhqdq %xmm4,%xmm1,%xmm1 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm1,%xmm4 ++ vpsubq %xmm14,%xmm4,%xmm4 ++ vpaddq %xmm1,%xmm14,%xmm14 ++ vpunpckhqdq %xmm14,%xmm4,%xmm1 ++ vpunpcklqdq %xmm14,%xmm4,%xmm4 ++ vmovdqa %xmm1,0(%rsp) ++ vpaddq %xmm1,%xmm1,%xmm1 ++ vmovdqa %xmm1,16(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 ++ vmovdqa %xmm1,32(%rsp) ++ vpmuludq %xmm4,%xmm10,%xmm1 ++ vpmuludq %xmm2,%xmm2,%xmm14 ++ vpaddq %xmm14,%xmm1,%xmm1 ++ vpmuludq 0(%rsp),%xmm10,%xmm14 ++ vpmuludq %xmm4,%xmm13,%xmm15 ++ vpaddq %xmm15,%xmm14,%xmm14 ++ vpunpcklqdq 
%xmm5,%xmm6,%xmm15 ++ vpunpckhqdq %xmm5,%xmm6,%xmm5 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm5,%xmm6 ++ vpsubq %xmm15,%xmm6,%xmm6 ++ vpaddq %xmm5,%xmm15,%xmm15 ++ vpunpckhqdq %xmm15,%xmm6,%xmm5 ++ vpunpcklqdq %xmm15,%xmm6,%xmm6 ++ vmovdqa %xmm6,48(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm6,%xmm6 ++ vmovdqa %xmm6,64(%rsp) ++ vmovdqa %xmm5,80(%rsp) ++ vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm5,%xmm5 ++ vmovdqa %xmm5,96(%rsp) ++ vpmuludq 48(%rsp),%xmm10,%xmm5 ++ vpaddq %xmm0,%xmm0,%xmm6 ++ vpmuludq %xmm6,%xmm0,%xmm0 ++ vpaddq %xmm0,%xmm5,%xmm5 ++ vpmuludq 80(%rsp),%xmm10,%xmm0 ++ vpmuludq %xmm4,%xmm6,%xmm15 ++ vpaddq %xmm15,%xmm0,%xmm0 ++ vpmuludq %xmm6,%xmm13,%xmm15 ++ vpaddq %xmm15,%xmm1,%xmm1 ++ vpmuludq %xmm6,%xmm2,%xmm15 ++ vpaddq %xmm15,%xmm14,%xmm14 ++ vpunpcklqdq %xmm7,%xmm8,%xmm15 ++ vpunpckhqdq %xmm7,%xmm8,%xmm7 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm7,%xmm8 ++ vpsubq %xmm15,%xmm8,%xmm8 ++ vpaddq %xmm7,%xmm15,%xmm15 ++ vpunpckhqdq %xmm15,%xmm8,%xmm7 ++ vpunpcklqdq %xmm15,%xmm8,%xmm8 ++ vmovdqa %xmm8,112(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm8,%xmm8 ++ vmovdqa %xmm8,448(%rsp) ++ vpmuludq 112(%rsp),%xmm10,%xmm8 ++ vpmuludq %xmm7,%xmm10,%xmm10 ++ vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm7,%xmm15 ++ vpmuludq %xmm15,%xmm7,%xmm7 ++ vpaddq %xmm7,%xmm8,%xmm8 ++ vpmuludq %xmm15,%xmm13,%xmm7 ++ vpaddq %xmm7,%xmm3,%xmm3 ++ vpmuludq %xmm15,%xmm2,%xmm7 ++ vpaddq %xmm7,%xmm11,%xmm11 ++ vpmuludq 80(%rsp),%xmm13,%xmm7 ++ vpaddq %xmm7,%xmm7,%xmm7 ++ vpaddq %xmm7,%xmm8,%xmm8 ++ vpmuludq 16(%rsp),%xmm13,%xmm7 ++ vpaddq %xmm7,%xmm5,%xmm5 ++ vpmuludq 48(%rsp),%xmm13,%xmm7 ++ vpaddq %xmm7,%xmm0,%xmm0 ++ vpmuludq 112(%rsp),%xmm13,%xmm7 ++ vpaddq %xmm7,%xmm10,%xmm10 ++ vpmuludq %xmm15,%xmm6,%xmm7 ++ vpaddq %xmm7,%xmm12,%xmm12 ++ vpmuludq %xmm15,%xmm4,%xmm7 ++ vpaddq %xmm7,%xmm9,%xmm9 ++ vpaddq %xmm2,%xmm2,%xmm2 ++ vpmuludq %xmm4,%xmm2,%xmm7 ++ vpaddq %xmm7,%xmm5,%xmm5 ++ vpmuludq 448(%rsp),%xmm2,%xmm7 ++ vpaddq %xmm7,%xmm3,%xmm3 ++ vpmuludq 448(%rsp),%xmm6,%xmm7 ++ vpaddq %xmm7,%xmm11,%xmm11 ++ vpmuludq 0(%rsp),%xmm2,%xmm7 ++ vpaddq %xmm7,%xmm0,%xmm0 ++ vpmuludq 48(%rsp),%xmm2,%xmm7 ++ vpaddq %xmm7,%xmm8,%xmm8 ++ vpmuludq 80(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpmuludq 96(%rsp),%xmm4,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpmuludq %xmm4,%xmm4,%xmm2 ++ vpaddq %xmm2,%xmm8,%xmm8 ++ vpaddq %xmm4,%xmm4,%xmm2 ++ vpmuludq 448(%rsp),%xmm2,%xmm4 ++ vpaddq %xmm4,%xmm12,%xmm12 ++ vpmuludq 16(%rsp),%xmm15,%xmm4 ++ vpaddq %xmm4,%xmm1,%xmm1 ++ vpmuludq 48(%rsp),%xmm15,%xmm4 ++ vpaddq %xmm4,%xmm14,%xmm14 ++ vpmuludq 96(%rsp),%xmm6,%xmm4 ++ vpaddq %xmm4,%xmm3,%xmm3 ++ vmovdqa 16(%rsp),%xmm4 ++ vpmuludq 448(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm9,%xmm9 ++ vpmuludq 16(%rsp),%xmm6,%xmm4 ++ vpaddq %xmm4,%xmm8,%xmm8 ++ vpmuludq 48(%rsp),%xmm6,%xmm4 ++ vpaddq %xmm4,%xmm10,%xmm10 ++ vpmuludq 80(%rsp),%xmm15,%xmm4 ++ vpaddq %xmm4,%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm5,%xmm5 ++ vpmuludq 112(%rsp),%xmm15,%xmm4 ++ vpaddq %xmm4,%xmm0,%xmm0 ++ vmovdqa 48(%rsp),%xmm4 ++ vpaddq %xmm4,%xmm4,%xmm4 ++ vpmuludq 448(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm1,%xmm1 ++ vmovdqa 80(%rsp),%xmm4 ++ vpaddq %xmm4,%xmm4,%xmm4 ++ vpmuludq 448(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm14,%xmm14 ++ vpmuludq 64(%rsp),%xmm2,%xmm4 ++ vpaddq %xmm4,%xmm3,%xmm3 ++ vmovdqa 16(%rsp),%xmm4 ++ vpmuludq 64(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm11,%xmm11 ++ vmovdqa 16(%rsp),%xmm4 ++ vpmuludq 96(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm12,%xmm12 ++ vmovdqa 48(%rsp),%xmm4 ++ vpmuludq 96(%rsp),%xmm4,%xmm4 ++ vpaddq 
%xmm4,%xmm9,%xmm9 ++ vpmuludq 0(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vmovdqa 32(%rsp),%xmm2 ++ vpmuludq 0(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm3,%xmm3 ++ vmovdqa 64(%rsp),%xmm2 ++ vpmuludq 48(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vmovdqa 96(%rsp),%xmm2 ++ vpmuludq 80(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm1,%xmm1 ++ vmovdqa 448(%rsp),%xmm2 ++ vpmuludq 112(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vpsrlq $26,%xmm3,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpand curve25519_sandy2x_m26(%rip),%xmm3,%xmm3 ++ vpsrlq $25,%xmm14,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14 ++ vpsrlq $25,%xmm11,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vpand curve25519_sandy2x_m25(%rip),%xmm11,%xmm11 ++ vpsrlq $26,%xmm5,%xmm2 ++ vpaddq %xmm2,%xmm0,%xmm0 ++ vpand curve25519_sandy2x_m26(%rip),%xmm5,%xmm5 ++ vpsrlq $26,%xmm12,%xmm2 ++ vpaddq %xmm2,%xmm9,%xmm9 ++ vpand curve25519_sandy2x_m26(%rip),%xmm12,%xmm12 ++ vpsrlq $25,%xmm0,%xmm2 ++ vpaddq %xmm2,%xmm8,%xmm8 ++ vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0 ++ vpsrlq $25,%xmm9,%xmm2 ++ vpaddq %xmm2,%xmm1,%xmm1 ++ vpand curve25519_sandy2x_m25(%rip),%xmm9,%xmm9 ++ vpsrlq $26,%xmm8,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8 ++ vpsrlq $26,%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm14,%xmm14 ++ vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1 ++ vpsrlq $25,%xmm10,%xmm2 ++ vpsllq $4,%xmm2,%xmm4 ++ vpaddq %xmm2,%xmm3,%xmm3 ++ vpsllq $1,%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm3,%xmm3 ++ vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 ++ vpsrlq $25,%xmm14,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14 ++ vpsrlq $26,%xmm3,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpand curve25519_sandy2x_m26(%rip),%xmm3,%xmm3 ++ vpunpckhqdq %xmm11,%xmm3,%xmm2 ++ vmovdqa %xmm2,0(%rsp) ++ vpshufd $0,%xmm3,%xmm2 ++ vpshufd $0,%xmm11,%xmm3 ++ vpmuludq 160(%rsp),%xmm2,%xmm4 ++ vpmuludq 432(%rsp),%xmm3,%xmm6 ++ vpaddq %xmm6,%xmm4,%xmm4 ++ vpmuludq 176(%rsp),%xmm2,%xmm6 ++ vpmuludq 304(%rsp),%xmm3,%xmm7 ++ vpaddq %xmm7,%xmm6,%xmm6 ++ vpmuludq 208(%rsp),%xmm2,%xmm7 ++ vpmuludq 336(%rsp),%xmm3,%xmm11 ++ vpaddq %xmm11,%xmm7,%xmm7 ++ vpmuludq 240(%rsp),%xmm2,%xmm11 ++ vpmuludq 368(%rsp),%xmm3,%xmm13 ++ vpaddq %xmm13,%xmm11,%xmm11 ++ vpmuludq 272(%rsp),%xmm2,%xmm2 ++ vpmuludq 400(%rsp),%xmm3,%xmm3 ++ vpaddq %xmm3,%xmm2,%xmm2 ++ vpunpckhqdq %xmm9,%xmm12,%xmm3 ++ vmovdqa %xmm3,16(%rsp) ++ vpshufd $0,%xmm12,%xmm3 ++ vpshufd $0,%xmm9,%xmm9 ++ vpmuludq 288(%rsp),%xmm3,%xmm12 ++ vpaddq %xmm12,%xmm4,%xmm4 ++ vpmuludq 416(%rsp),%xmm9,%xmm12 ++ vpaddq %xmm12,%xmm4,%xmm4 ++ vpmuludq 160(%rsp),%xmm3,%xmm12 ++ vpaddq %xmm12,%xmm6,%xmm6 ++ vpmuludq 432(%rsp),%xmm9,%xmm12 ++ vpaddq %xmm12,%xmm6,%xmm6 ++ vpmuludq 176(%rsp),%xmm3,%xmm12 ++ vpaddq %xmm12,%xmm7,%xmm7 ++ vpmuludq 304(%rsp),%xmm9,%xmm12 ++ vpaddq %xmm12,%xmm7,%xmm7 ++ vpmuludq 208(%rsp),%xmm3,%xmm12 ++ vpaddq %xmm12,%xmm11,%xmm11 ++ vpmuludq 336(%rsp),%xmm9,%xmm12 ++ vpaddq %xmm12,%xmm11,%xmm11 ++ vpmuludq 240(%rsp),%xmm3,%xmm3 ++ vpaddq %xmm3,%xmm2,%xmm2 ++ vpmuludq 368(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm2,%xmm2 ++ vpunpckhqdq %xmm14,%xmm1,%xmm3 ++ vmovdqa %xmm3,32(%rsp) ++ vpshufd $0,%xmm1,%xmm1 ++ vpshufd $0,%xmm14,%xmm3 ++ vpmuludq 256(%rsp),%xmm1,%xmm9 ++ vpaddq %xmm9,%xmm4,%xmm4 ++ vpmuludq 384(%rsp),%xmm3,%xmm9 ++ vpaddq %xmm9,%xmm4,%xmm4 ++ vpmuludq 288(%rsp),%xmm1,%xmm9 ++ vpaddq %xmm9,%xmm6,%xmm6 ++ vpmuludq 416(%rsp),%xmm3,%xmm9 ++ vpaddq %xmm9,%xmm6,%xmm6 ++ vpmuludq 
160(%rsp),%xmm1,%xmm9 ++ vpaddq %xmm9,%xmm7,%xmm7 ++ vpmuludq 432(%rsp),%xmm3,%xmm9 ++ vpaddq %xmm9,%xmm7,%xmm7 ++ vpmuludq 176(%rsp),%xmm1,%xmm9 ++ vpaddq %xmm9,%xmm11,%xmm11 ++ vpmuludq 304(%rsp),%xmm3,%xmm9 ++ vpaddq %xmm9,%xmm11,%xmm11 ++ vpmuludq 208(%rsp),%xmm1,%xmm1 ++ vpaddq %xmm1,%xmm2,%xmm2 ++ vpmuludq 336(%rsp),%xmm3,%xmm1 ++ vpaddq %xmm1,%xmm2,%xmm2 ++ vpunpckhqdq %xmm0,%xmm5,%xmm1 ++ vmovdqa %xmm1,48(%rsp) ++ vpshufd $0,%xmm5,%xmm1 ++ vpshufd $0,%xmm0,%xmm0 ++ vpmuludq 224(%rsp),%xmm1,%xmm3 ++ vpaddq %xmm3,%xmm4,%xmm4 ++ vpmuludq 352(%rsp),%xmm0,%xmm3 ++ vpaddq %xmm3,%xmm4,%xmm4 ++ vpmuludq 256(%rsp),%xmm1,%xmm3 ++ vpaddq %xmm3,%xmm6,%xmm6 ++ vpmuludq 384(%rsp),%xmm0,%xmm3 ++ vpaddq %xmm3,%xmm6,%xmm6 ++ vpmuludq 288(%rsp),%xmm1,%xmm3 ++ vpaddq %xmm3,%xmm7,%xmm7 ++ vpmuludq 416(%rsp),%xmm0,%xmm3 ++ vpaddq %xmm3,%xmm7,%xmm7 ++ vpmuludq 160(%rsp),%xmm1,%xmm3 ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vpmuludq 432(%rsp),%xmm0,%xmm3 ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vpmuludq 176(%rsp),%xmm1,%xmm1 ++ vpaddq %xmm1,%xmm2,%xmm2 ++ vpmuludq 304(%rsp),%xmm0,%xmm0 ++ vpaddq %xmm0,%xmm2,%xmm2 ++ vpunpckhqdq %xmm10,%xmm8,%xmm0 ++ vmovdqa %xmm0,64(%rsp) ++ vpshufd $0,%xmm8,%xmm0 ++ vpshufd $0,%xmm10,%xmm1 ++ vpmuludq 192(%rsp),%xmm0,%xmm3 ++ vpaddq %xmm3,%xmm4,%xmm4 ++ vpmuludq 320(%rsp),%xmm1,%xmm3 ++ vpaddq %xmm3,%xmm4,%xmm4 ++ vpmuludq 224(%rsp),%xmm0,%xmm3 ++ vpaddq %xmm3,%xmm6,%xmm6 ++ vpmuludq 352(%rsp),%xmm1,%xmm3 ++ vpaddq %xmm3,%xmm6,%xmm6 ++ vpmuludq 256(%rsp),%xmm0,%xmm3 ++ vpaddq %xmm3,%xmm7,%xmm7 ++ vpmuludq 384(%rsp),%xmm1,%xmm3 ++ vpaddq %xmm3,%xmm7,%xmm7 ++ vpmuludq 288(%rsp),%xmm0,%xmm3 ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vpmuludq 416(%rsp),%xmm1,%xmm3 ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vpmuludq 160(%rsp),%xmm0,%xmm0 ++ vpaddq %xmm0,%xmm2,%xmm2 ++ vpmuludq 432(%rsp),%xmm1,%xmm0 ++ vpaddq %xmm0,%xmm2,%xmm2 ++ vmovdqa %xmm4,80(%rsp) ++ vmovdqa %xmm6,96(%rsp) ++ vmovdqa %xmm7,112(%rsp) ++ vmovdqa %xmm11,448(%rsp) ++ vmovdqa %xmm2,496(%rsp) ++ vmovdqa 144(%rsp),%xmm0 ++ vpmuludq %xmm0,%xmm0,%xmm1 ++ vpaddq %xmm0,%xmm0,%xmm0 ++ vmovdqa 128(%rsp),%xmm2 ++ vpmuludq %xmm2,%xmm0,%xmm3 ++ vmovdqa 480(%rsp),%xmm4 ++ vpmuludq %xmm4,%xmm0,%xmm5 ++ vmovdqa 464(%rsp),%xmm6 ++ vpmuludq %xmm6,%xmm0,%xmm7 ++ vmovdqa 528(%rsp),%xmm8 ++ vpmuludq %xmm8,%xmm0,%xmm9 ++ vpmuludq 512(%rsp),%xmm0,%xmm10 ++ vpmuludq 592(%rsp),%xmm0,%xmm11 ++ vpmuludq 576(%rsp),%xmm0,%xmm12 ++ vpmuludq 624(%rsp),%xmm0,%xmm13 ++ vmovdqa 672(%rsp),%xmm14 ++ vpmuludq %xmm14,%xmm0,%xmm0 ++ vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm14,%xmm15 ++ vpmuludq %xmm15,%xmm14,%xmm14 ++ vpaddq %xmm14,%xmm13,%xmm13 ++ vpaddq %xmm6,%xmm6,%xmm14 ++ vpmuludq %xmm14,%xmm6,%xmm6 ++ vpaddq %xmm6,%xmm11,%xmm11 ++ vpaddq %xmm2,%xmm2,%xmm6 ++ vpmuludq %xmm6,%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vpmuludq %xmm15,%xmm6,%xmm2 ++ vpaddq %xmm2,%xmm1,%xmm1 ++ vpmuludq %xmm15,%xmm4,%xmm2 ++ vpaddq %xmm2,%xmm3,%xmm3 ++ vpmuludq 544(%rsp),%xmm6,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpmuludq 592(%rsp),%xmm6,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vpmuludq 640(%rsp),%xmm6,%xmm2 ++ vpaddq %xmm2,%xmm13,%xmm13 ++ vpmuludq 624(%rsp),%xmm6,%xmm2 ++ vpaddq %xmm2,%xmm0,%xmm0 ++ vpmuludq %xmm4,%xmm6,%xmm2 ++ vpaddq %xmm2,%xmm7,%xmm7 ++ vpmuludq %xmm14,%xmm6,%xmm2 ++ vpaddq %xmm2,%xmm9,%xmm9 ++ vpmuludq %xmm8,%xmm6,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpmuludq %xmm15,%xmm14,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vpmuludq %xmm15,%xmm8,%xmm2 ++ vpaddq %xmm2,%xmm7,%xmm7 ++ vpmuludq %xmm4,%xmm4,%xmm2 ++ vpaddq %xmm2,%xmm9,%xmm9 ++ vpmuludq %xmm14,%xmm4,%xmm2 ++ vpaddq 
%xmm2,%xmm10,%xmm10 ++ vpaddq %xmm4,%xmm4,%xmm2 ++ vpmuludq %xmm8,%xmm2,%xmm4 ++ vpaddq %xmm4,%xmm11,%xmm11 ++ vpmuludq 688(%rsp),%xmm2,%xmm4 ++ vpaddq %xmm4,%xmm1,%xmm1 ++ vpmuludq 688(%rsp),%xmm14,%xmm4 ++ vpaddq %xmm4,%xmm3,%xmm3 ++ vpmuludq 512(%rsp),%xmm2,%xmm4 ++ vpaddq %xmm4,%xmm12,%xmm12 ++ vpmuludq 592(%rsp),%xmm2,%xmm4 ++ vpaddq %xmm4,%xmm13,%xmm13 ++ vpmuludq 576(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm0,%xmm0 ++ vpmuludq 656(%rsp),%xmm8,%xmm2 ++ vpaddq %xmm2,%xmm3,%xmm3 ++ vpmuludq %xmm8,%xmm14,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vpmuludq %xmm8,%xmm8,%xmm2 ++ vpaddq %xmm2,%xmm13,%xmm13 ++ vpaddq %xmm8,%xmm8,%xmm2 ++ vpmuludq 688(%rsp),%xmm2,%xmm4 ++ vpaddq %xmm4,%xmm5,%xmm5 ++ vpmuludq 544(%rsp),%xmm15,%xmm4 ++ vpaddq %xmm4,%xmm9,%xmm9 ++ vpmuludq 592(%rsp),%xmm15,%xmm4 ++ vpaddq %xmm4,%xmm10,%xmm10 ++ vpmuludq 656(%rsp),%xmm14,%xmm4 ++ vpaddq %xmm4,%xmm1,%xmm1 ++ vmovdqa 544(%rsp),%xmm4 ++ vpmuludq 688(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm7,%xmm7 ++ vpmuludq 544(%rsp),%xmm14,%xmm4 ++ vpaddq %xmm4,%xmm13,%xmm13 ++ vpmuludq 592(%rsp),%xmm14,%xmm4 ++ vpaddq %xmm4,%xmm0,%xmm0 ++ vpmuludq 640(%rsp),%xmm15,%xmm4 ++ vpaddq %xmm4,%xmm11,%xmm11 ++ vpmuludq 624(%rsp),%xmm15,%xmm4 ++ vpaddq %xmm4,%xmm12,%xmm12 ++ vmovdqa 592(%rsp),%xmm4 ++ vpaddq %xmm4,%xmm4,%xmm4 ++ vpmuludq 688(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm9,%xmm9 ++ vpmuludq 608(%rsp),%xmm2,%xmm4 ++ vpaddq %xmm4,%xmm1,%xmm1 ++ vmovdqa 544(%rsp),%xmm4 ++ vpmuludq 608(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm3,%xmm3 ++ vmovdqa 544(%rsp),%xmm4 ++ vpmuludq 656(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm5,%xmm5 ++ vmovdqa 592(%rsp),%xmm4 ++ vpmuludq 656(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm7,%xmm7 ++ vmovdqa 640(%rsp),%xmm4 ++ vpmuludq 688(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm10,%xmm10 ++ vpmuludq 512(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm0,%xmm0 ++ vmovdqa 560(%rsp),%xmm2 ++ vpmuludq 512(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm1,%xmm1 ++ vmovdqa 608(%rsp),%xmm2 ++ vpmuludq 592(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vmovdqa 656(%rsp),%xmm2 ++ vpmuludq 576(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm9,%xmm9 ++ vmovdqa 688(%rsp),%xmm2 ++ vpmuludq 624(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpsrlq $26,%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm3,%xmm3 ++ vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1 ++ vpsrlq $25,%xmm10,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 ++ vpsrlq $25,%xmm3,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vpand curve25519_sandy2x_m25(%rip),%xmm3,%xmm3 ++ vpsrlq $26,%xmm11,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 ++ vpsrlq $26,%xmm5,%xmm2 ++ vpaddq %xmm2,%xmm7,%xmm7 ++ vpand curve25519_sandy2x_m26(%rip),%xmm5,%xmm5 ++ vpsrlq $25,%xmm12,%xmm2 ++ vpaddq %xmm2,%xmm13,%xmm13 ++ vpand curve25519_sandy2x_m25(%rip),%xmm12,%xmm12 ++ vpsrlq $25,%xmm7,%xmm2 ++ vpaddq %xmm2,%xmm9,%xmm9 ++ vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7 ++ vpsrlq $26,%xmm13,%xmm2 ++ vpaddq %xmm2,%xmm0,%xmm0 ++ vpand curve25519_sandy2x_m26(%rip),%xmm13,%xmm13 ++ vpsrlq $26,%xmm9,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9 ++ vpsrlq $25,%xmm0,%xmm2 ++ vpsllq $4,%xmm2,%xmm4 ++ vpaddq %xmm2,%xmm1,%xmm1 ++ vpsllq $1,%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm1,%xmm1 ++ vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0 ++ vpsrlq $25,%xmm10,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 ++ vpsrlq $26,%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm3,%xmm3 ++ vpand 
curve25519_sandy2x_m26(%rip),%xmm1,%xmm1 ++ vpunpckhqdq %xmm3,%xmm1,%xmm2 ++ vpunpcklqdq %xmm3,%xmm1,%xmm1 ++ vmovdqa %xmm1,464(%rsp) ++ vpaddq curve25519_sandy2x_subc0(%rip),%xmm2,%xmm3 ++ vpsubq %xmm1,%xmm3,%xmm3 ++ vpunpckhqdq %xmm3,%xmm2,%xmm1 ++ vpunpcklqdq %xmm3,%xmm2,%xmm2 ++ vmovdqa %xmm2,480(%rsp) ++ vmovdqa %xmm1,512(%rsp) ++ vpsllq $1,%xmm1,%xmm1 ++ vmovdqa %xmm1,528(%rsp) ++ vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm3,%xmm3 ++ vmovdqa 80(%rsp),%xmm1 ++ vpunpcklqdq %xmm1,%xmm3,%xmm2 ++ vpunpckhqdq %xmm1,%xmm3,%xmm1 ++ vpunpckhqdq %xmm7,%xmm5,%xmm3 ++ vpunpcklqdq %xmm7,%xmm5,%xmm4 ++ vmovdqa %xmm4,544(%rsp) ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm5 ++ vpsubq %xmm4,%xmm5,%xmm5 ++ vpunpckhqdq %xmm5,%xmm3,%xmm4 ++ vpunpcklqdq %xmm5,%xmm3,%xmm3 ++ vmovdqa %xmm3,560(%rsp) ++ vmovdqa %xmm4,576(%rsp) ++ vpsllq $1,%xmm4,%xmm4 ++ vmovdqa %xmm4,592(%rsp) ++ vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm5,%xmm5 ++ vmovdqa 96(%rsp),%xmm3 ++ vpunpcklqdq %xmm3,%xmm5,%xmm4 ++ vpunpckhqdq %xmm3,%xmm5,%xmm3 ++ vpunpckhqdq %xmm10,%xmm9,%xmm5 ++ vpunpcklqdq %xmm10,%xmm9,%xmm6 ++ vmovdqa %xmm6,608(%rsp) ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm5,%xmm7 ++ vpsubq %xmm6,%xmm7,%xmm7 ++ vpunpckhqdq %xmm7,%xmm5,%xmm6 ++ vpunpcklqdq %xmm7,%xmm5,%xmm5 ++ vmovdqa %xmm5,624(%rsp) ++ vmovdqa %xmm6,640(%rsp) ++ vpsllq $1,%xmm6,%xmm6 ++ vmovdqa %xmm6,656(%rsp) ++ vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm7,%xmm7 ++ vmovdqa 112(%rsp),%xmm5 ++ vpunpcklqdq %xmm5,%xmm7,%xmm6 ++ vpunpckhqdq %xmm5,%xmm7,%xmm5 ++ vpunpckhqdq %xmm12,%xmm11,%xmm7 ++ vpunpcklqdq %xmm12,%xmm11,%xmm8 ++ vmovdqa %xmm8,672(%rsp) ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm7,%xmm9 ++ vpsubq %xmm8,%xmm9,%xmm9 ++ vpunpckhqdq %xmm9,%xmm7,%xmm8 ++ vpunpcklqdq %xmm9,%xmm7,%xmm7 ++ vmovdqa %xmm7,688(%rsp) ++ vmovdqa %xmm8,704(%rsp) ++ vpsllq $1,%xmm8,%xmm8 ++ vmovdqa %xmm8,720(%rsp) ++ vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm9,%xmm9 ++ vmovdqa 448(%rsp),%xmm7 ++ vpunpcklqdq %xmm7,%xmm9,%xmm8 ++ vpunpckhqdq %xmm7,%xmm9,%xmm7 ++ vpunpckhqdq %xmm0,%xmm13,%xmm9 ++ vpunpcklqdq %xmm0,%xmm13,%xmm0 ++ vmovdqa %xmm0,448(%rsp) ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm9,%xmm10 ++ vpsubq %xmm0,%xmm10,%xmm10 ++ vpunpckhqdq %xmm10,%xmm9,%xmm0 ++ vpunpcklqdq %xmm10,%xmm9,%xmm9 ++ vmovdqa %xmm9,736(%rsp) ++ vmovdqa %xmm0,752(%rsp) ++ vpsllq $1,%xmm0,%xmm0 ++ vmovdqa %xmm0,768(%rsp) ++ vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm10,%xmm10 ++ vmovdqa 496(%rsp),%xmm0 ++ vpunpcklqdq %xmm0,%xmm10,%xmm9 ++ vpunpckhqdq %xmm0,%xmm10,%xmm0 ++ vpsrlq $26,%xmm2,%xmm10 ++ vpaddq %xmm10,%xmm1,%xmm1 ++ vpand curve25519_sandy2x_m26(%rip),%xmm2,%xmm2 ++ vpsrlq $25,%xmm5,%xmm10 ++ vpaddq %xmm10,%xmm8,%xmm8 ++ vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5 ++ vpsrlq $25,%xmm1,%xmm10 ++ vpaddq %xmm10,%xmm4,%xmm4 ++ vpand curve25519_sandy2x_m25(%rip),%xmm1,%xmm1 ++ vpsrlq $26,%xmm8,%xmm10 ++ vpaddq %xmm10,%xmm7,%xmm7 ++ vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8 ++ vpsrlq $26,%xmm4,%xmm10 ++ vpaddq %xmm10,%xmm3,%xmm3 ++ vpand curve25519_sandy2x_m26(%rip),%xmm4,%xmm4 ++ vpsrlq $25,%xmm7,%xmm10 ++ vpaddq %xmm10,%xmm9,%xmm9 ++ vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7 ++ vpsrlq $25,%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm6,%xmm6 ++ vpand curve25519_sandy2x_m25(%rip),%xmm3,%xmm3 ++ vpsrlq $26,%xmm9,%xmm10 ++ vpaddq %xmm10,%xmm0,%xmm0 ++ vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9 ++ vpsrlq $26,%xmm6,%xmm10 ++ vpaddq %xmm10,%xmm5,%xmm5 ++ vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6 ++ vpsrlq $25,%xmm0,%xmm10 ++ 
vpsllq $4,%xmm10,%xmm11 ++ vpaddq %xmm10,%xmm2,%xmm2 ++ vpsllq $1,%xmm10,%xmm10 ++ vpaddq %xmm10,%xmm11,%xmm11 ++ vpaddq %xmm11,%xmm2,%xmm2 ++ vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0 ++ vpsrlq $25,%xmm5,%xmm10 ++ vpaddq %xmm10,%xmm8,%xmm8 ++ vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5 ++ vpsrlq $26,%xmm2,%xmm10 ++ vpaddq %xmm10,%xmm1,%xmm1 ++ vpand curve25519_sandy2x_m26(%rip),%xmm2,%xmm2 ++ vpunpckhqdq %xmm1,%xmm2,%xmm10 ++ vmovdqa %xmm10,80(%rsp) ++ vpunpcklqdq %xmm1,%xmm2,%xmm1 ++ vpunpckhqdq %xmm3,%xmm4,%xmm2 ++ vmovdqa %xmm2,96(%rsp) ++ vpunpcklqdq %xmm3,%xmm4,%xmm2 ++ vpunpckhqdq %xmm5,%xmm6,%xmm3 ++ vmovdqa %xmm3,112(%rsp) ++ vpunpcklqdq %xmm5,%xmm6,%xmm3 ++ vpunpckhqdq %xmm7,%xmm8,%xmm4 ++ vmovdqa %xmm4,128(%rsp) ++ vpunpcklqdq %xmm7,%xmm8,%xmm4 ++ vpunpckhqdq %xmm0,%xmm9,%xmm5 ++ vmovdqa %xmm5,144(%rsp) ++ vpunpcklqdq %xmm0,%xmm9,%xmm0 ++ vmovdqa 464(%rsp),%xmm5 ++ vpaddq %xmm5,%xmm1,%xmm1 ++ vpunpcklqdq %xmm1,%xmm5,%xmm6 ++ vpunpckhqdq %xmm1,%xmm5,%xmm1 ++ vpmuludq 512(%rsp),%xmm6,%xmm5 ++ vpmuludq 480(%rsp),%xmm1,%xmm7 ++ vpaddq %xmm7,%xmm5,%xmm5 ++ vpmuludq 560(%rsp),%xmm6,%xmm7 ++ vpmuludq 528(%rsp),%xmm1,%xmm8 ++ vpaddq %xmm8,%xmm7,%xmm7 ++ vpmuludq 576(%rsp),%xmm6,%xmm8 ++ vpmuludq 560(%rsp),%xmm1,%xmm9 ++ vpaddq %xmm9,%xmm8,%xmm8 ++ vpmuludq 624(%rsp),%xmm6,%xmm9 ++ vpmuludq 592(%rsp),%xmm1,%xmm10 ++ vpaddq %xmm10,%xmm9,%xmm9 ++ vpmuludq 640(%rsp),%xmm6,%xmm10 ++ vpmuludq 624(%rsp),%xmm1,%xmm11 ++ vpaddq %xmm11,%xmm10,%xmm10 ++ vpmuludq 688(%rsp),%xmm6,%xmm11 ++ vpmuludq 656(%rsp),%xmm1,%xmm12 ++ vpaddq %xmm12,%xmm11,%xmm11 ++ vpmuludq 704(%rsp),%xmm6,%xmm12 ++ vpmuludq 688(%rsp),%xmm1,%xmm13 ++ vpaddq %xmm13,%xmm12,%xmm12 ++ vpmuludq 736(%rsp),%xmm6,%xmm13 ++ vpmuludq 720(%rsp),%xmm1,%xmm14 ++ vpaddq %xmm14,%xmm13,%xmm13 ++ vpmuludq 752(%rsp),%xmm6,%xmm14 ++ vpmuludq 736(%rsp),%xmm1,%xmm15 ++ vpaddq %xmm15,%xmm14,%xmm14 ++ vpmuludq 480(%rsp),%xmm6,%xmm6 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 ++ vpmuludq 768(%rsp),%xmm1,%xmm1 ++ vpaddq %xmm1,%xmm6,%xmm6 ++ vmovdqa 544(%rsp),%xmm1 ++ vpaddq %xmm1,%xmm2,%xmm2 ++ vpunpcklqdq %xmm2,%xmm1,%xmm15 ++ vpunpckhqdq %xmm2,%xmm1,%xmm1 ++ vpmuludq 480(%rsp),%xmm15,%xmm2 ++ vpaddq %xmm2,%xmm7,%xmm7 ++ vpmuludq 512(%rsp),%xmm15,%xmm2 ++ vpaddq %xmm2,%xmm8,%xmm8 ++ vpmuludq 560(%rsp),%xmm15,%xmm2 ++ vpaddq %xmm2,%xmm9,%xmm9 ++ vpmuludq 576(%rsp),%xmm15,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpmuludq 624(%rsp),%xmm15,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpmuludq 640(%rsp),%xmm15,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vpmuludq 688(%rsp),%xmm15,%xmm2 ++ vpaddq %xmm2,%xmm13,%xmm13 ++ vpmuludq 704(%rsp),%xmm15,%xmm2 ++ vpaddq %xmm2,%xmm14,%xmm14 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm15,%xmm15 ++ vpmuludq 736(%rsp),%xmm15,%xmm2 ++ vpaddq %xmm2,%xmm6,%xmm6 ++ vpmuludq 752(%rsp),%xmm15,%xmm15 ++ vpaddq %xmm15,%xmm5,%xmm5 ++ vpmuludq 480(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm8,%xmm8 ++ vpmuludq 528(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm9,%xmm9 ++ vpmuludq 560(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpmuludq 592(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpmuludq 624(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vpmuludq 656(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm13,%xmm13 ++ vpmuludq 688(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm14,%xmm14 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 ++ vpmuludq 720(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm6,%xmm6 ++ vpmuludq 736(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vpmuludq 768(%rsp),%xmm1,%xmm1 ++ vpaddq %xmm1,%xmm7,%xmm7 ++ vmovdqa 
608(%rsp),%xmm1 ++ vpaddq %xmm1,%xmm3,%xmm3 ++ vpunpcklqdq %xmm3,%xmm1,%xmm2 ++ vpunpckhqdq %xmm3,%xmm1,%xmm1 ++ vpmuludq 480(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm9,%xmm9 ++ vpmuludq 512(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm10,%xmm10 ++ vpmuludq 560(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vpmuludq 576(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm12,%xmm12 ++ vpmuludq 624(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm13,%xmm13 ++ vpmuludq 640(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm14,%xmm14 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2 ++ vpmuludq 688(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm6,%xmm6 ++ vpmuludq 704(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm5,%xmm5 ++ vpmuludq 736(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm7,%xmm7 ++ vpmuludq 752(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm8,%xmm8 ++ vpmuludq 480(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpmuludq 528(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpmuludq 560(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vpmuludq 592(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm13,%xmm13 ++ vpmuludq 624(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm14,%xmm14 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 ++ vpmuludq 656(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm6,%xmm6 ++ vpmuludq 688(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vpmuludq 720(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm7,%xmm7 ++ vpmuludq 736(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm8,%xmm8 ++ vpmuludq 768(%rsp),%xmm1,%xmm1 ++ vpaddq %xmm1,%xmm9,%xmm9 ++ vmovdqa 672(%rsp),%xmm1 ++ vpaddq %xmm1,%xmm4,%xmm4 ++ vpunpcklqdq %xmm4,%xmm1,%xmm2 ++ vpunpckhqdq %xmm4,%xmm1,%xmm1 ++ vpmuludq 480(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vpmuludq 512(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm12,%xmm12 ++ vpmuludq 560(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm13,%xmm13 ++ vpmuludq 576(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm14,%xmm14 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2 ++ vpmuludq 624(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm6,%xmm6 ++ vpmuludq 640(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm5,%xmm5 ++ vpmuludq 688(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm7,%xmm7 ++ vpmuludq 704(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm8,%xmm8 ++ vpmuludq 736(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm9,%xmm9 ++ vpmuludq 752(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpmuludq 480(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vpmuludq 528(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm13,%xmm13 ++ vpmuludq 560(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm14,%xmm14 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 ++ vpmuludq 592(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm6,%xmm6 ++ vpmuludq 624(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vpmuludq 656(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm7,%xmm7 ++ vpmuludq 688(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm8,%xmm8 ++ vpmuludq 720(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm9,%xmm9 ++ vpmuludq 736(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpmuludq 768(%rsp),%xmm1,%xmm1 ++ vpaddq %xmm1,%xmm11,%xmm11 ++ vmovdqa 448(%rsp),%xmm1 ++ vpaddq %xmm1,%xmm0,%xmm0 ++ vpunpcklqdq %xmm0,%xmm1,%xmm2 ++ vpunpckhqdq %xmm0,%xmm1,%xmm0 ++ vpmuludq 480(%rsp),%xmm2,%xmm1 ++ vpaddq %xmm1,%xmm13,%xmm13 ++ vpmuludq 512(%rsp),%xmm2,%xmm1 ++ vpaddq %xmm1,%xmm14,%xmm14 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2 ++ vpmuludq 560(%rsp),%xmm2,%xmm1 ++ vpaddq %xmm1,%xmm6,%xmm6 ++ vpmuludq 576(%rsp),%xmm2,%xmm1 ++ vpaddq %xmm1,%xmm5,%xmm5 ++ vpmuludq 624(%rsp),%xmm2,%xmm1 ++ vpaddq %xmm1,%xmm7,%xmm7 ++ vpmuludq 640(%rsp),%xmm2,%xmm1 ++ vpaddq %xmm1,%xmm8,%xmm8 ++ vpmuludq 
688(%rsp),%xmm2,%xmm1 ++ vpaddq %xmm1,%xmm9,%xmm9 ++ vpmuludq 704(%rsp),%xmm2,%xmm1 ++ vpaddq %xmm1,%xmm10,%xmm10 ++ vpmuludq 736(%rsp),%xmm2,%xmm1 ++ vpaddq %xmm1,%xmm11,%xmm11 ++ vpmuludq 752(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vpmuludq 480(%rsp),%xmm0,%xmm1 ++ vpaddq %xmm1,%xmm14,%xmm14 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm0,%xmm0 ++ vpmuludq 528(%rsp),%xmm0,%xmm1 ++ vpaddq %xmm1,%xmm6,%xmm6 ++ vpmuludq 560(%rsp),%xmm0,%xmm1 ++ vpaddq %xmm1,%xmm5,%xmm5 ++ vpmuludq 592(%rsp),%xmm0,%xmm1 ++ vpaddq %xmm1,%xmm7,%xmm7 ++ vpmuludq 624(%rsp),%xmm0,%xmm1 ++ vpaddq %xmm1,%xmm8,%xmm8 ++ vpmuludq 656(%rsp),%xmm0,%xmm1 ++ vpaddq %xmm1,%xmm9,%xmm9 ++ vpmuludq 688(%rsp),%xmm0,%xmm1 ++ vpaddq %xmm1,%xmm10,%xmm10 ++ vpmuludq 720(%rsp),%xmm0,%xmm1 ++ vpaddq %xmm1,%xmm11,%xmm11 ++ vpmuludq 736(%rsp),%xmm0,%xmm1 ++ vpaddq %xmm1,%xmm12,%xmm12 ++ vpmuludq 768(%rsp),%xmm0,%xmm0 ++ vpaddq %xmm0,%xmm13,%xmm13 ++ vpsrlq $26,%xmm6,%xmm0 ++ vpaddq %xmm0,%xmm5,%xmm5 ++ vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6 ++ vpsrlq $25,%xmm10,%xmm0 ++ vpaddq %xmm0,%xmm11,%xmm11 ++ vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 ++ vpsrlq $25,%xmm5,%xmm0 ++ vpaddq %xmm0,%xmm7,%xmm7 ++ vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5 ++ vpsrlq $26,%xmm11,%xmm0 ++ vpaddq %xmm0,%xmm12,%xmm12 ++ vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 ++ vpsrlq $26,%xmm7,%xmm0 ++ vpaddq %xmm0,%xmm8,%xmm8 ++ vpand curve25519_sandy2x_m26(%rip),%xmm7,%xmm7 ++ vpsrlq $25,%xmm12,%xmm0 ++ vpaddq %xmm0,%xmm13,%xmm13 ++ vpand curve25519_sandy2x_m25(%rip),%xmm12,%xmm12 ++ vpsrlq $25,%xmm8,%xmm0 ++ vpaddq %xmm0,%xmm9,%xmm9 ++ vpand curve25519_sandy2x_m25(%rip),%xmm8,%xmm8 ++ vpsrlq $26,%xmm13,%xmm0 ++ vpaddq %xmm0,%xmm14,%xmm14 ++ vpand curve25519_sandy2x_m26(%rip),%xmm13,%xmm13 ++ vpsrlq $26,%xmm9,%xmm0 ++ vpaddq %xmm0,%xmm10,%xmm10 ++ vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9 ++ vpsrlq $25,%xmm14,%xmm0 ++ vpsllq $4,%xmm0,%xmm1 ++ vpaddq %xmm0,%xmm6,%xmm6 ++ vpsllq $1,%xmm0,%xmm0 ++ vpaddq %xmm0,%xmm1,%xmm1 ++ vpaddq %xmm1,%xmm6,%xmm6 ++ vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14 ++ vpsrlq $25,%xmm10,%xmm0 ++ vpaddq %xmm0,%xmm11,%xmm11 ++ vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 ++ vpsrlq $26,%xmm6,%xmm0 ++ vpaddq %xmm0,%xmm5,%xmm5 ++ vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6 ++ vpunpckhqdq %xmm5,%xmm6,%xmm1 ++ vpunpcklqdq %xmm5,%xmm6,%xmm0 ++ vpunpckhqdq %xmm8,%xmm7,%xmm3 ++ vpunpcklqdq %xmm8,%xmm7,%xmm2 ++ vpunpckhqdq %xmm10,%xmm9,%xmm5 ++ vpunpcklqdq %xmm10,%xmm9,%xmm4 ++ vpunpckhqdq %xmm12,%xmm11,%xmm7 ++ vpunpcklqdq %xmm12,%xmm11,%xmm6 ++ vpunpckhqdq %xmm14,%xmm13,%xmm9 ++ vpunpcklqdq %xmm14,%xmm13,%xmm8 ++ cmp $0,%rdx ++ jne .Lladder_loop ++ vmovdqu %xmm1,160(%rdi) ++ vmovdqu %xmm0,80(%rdi) ++ vmovdqu %xmm3,176(%rdi) ++ vmovdqu %xmm2,96(%rdi) ++ vmovdqu %xmm5,192(%rdi) ++ vmovdqu %xmm4,112(%rdi) ++ vmovdqu %xmm7,208(%rdi) ++ vmovdqu %xmm6,128(%rdi) ++ vmovdqu %xmm9,224(%rdi) ++ vmovdqu %xmm8,144(%rdi) ++ movq 1824(%rsp),%r11 ++ movq 1832(%rsp),%r12 ++ movq 1840(%rsp),%r13 ++ movq 1848(%rsp),%r14 ++ add %r11,%rsp ++ ret ++ENDPROC(curve25519_sandy2x_ladder) ++ ++.align 32 ++ENTRY(curve25519_sandy2x_ladder_base) ++ mov %rsp,%r11 ++ and $31,%r11 ++ add $1568,%r11 ++ sub %r11,%rsp ++ movq %r11,1536(%rsp) ++ movq %r12,1544(%rsp) ++ movq %r13,1552(%rsp) ++ vmovdqa curve25519_sandy2x_v0_0(%rip),%xmm0 ++ vmovdqa curve25519_sandy2x_v1_0(%rip),%xmm1 ++ vmovdqa curve25519_sandy2x_v9_0(%rip),%xmm2 ++ vmovdqa %xmm2,0(%rsp) ++ vmovdqa %xmm0,16(%rsp) ++ vmovdqa %xmm0,32(%rsp) ++ vmovdqa %xmm0,48(%rsp) ++ 
vmovdqa %xmm0,64(%rsp) ++ vmovdqa %xmm1,80(%rsp) ++ vmovdqa %xmm0,96(%rsp) ++ vmovdqa %xmm0,112(%rsp) ++ vmovdqa %xmm0,128(%rsp) ++ vmovdqa %xmm0,144(%rsp) ++ vmovdqa %xmm1,%xmm0 ++ vpxor %xmm1,%xmm1,%xmm1 ++ vpxor %xmm2,%xmm2,%xmm2 ++ vpxor %xmm3,%xmm3,%xmm3 ++ vpxor %xmm4,%xmm4,%xmm4 ++ vpxor %xmm5,%xmm5,%xmm5 ++ vpxor %xmm6,%xmm6,%xmm6 ++ vpxor %xmm7,%xmm7,%xmm7 ++ vpxor %xmm8,%xmm8,%xmm8 ++ vpxor %xmm9,%xmm9,%xmm9 ++ movq 0(%rsi),%rdx ++ movq 8(%rsi),%rcx ++ movq 16(%rsi),%r8 ++ movq 24(%rsi),%r9 ++ shrd $1,%rcx,%rdx ++ shrd $1,%r8,%rcx ++ shrd $1,%r9,%r8 ++ shr $1,%r9 ++ xorq 0(%rsi),%rdx ++ xorq 8(%rsi),%rcx ++ xorq 16(%rsi),%r8 ++ xorq 24(%rsi),%r9 ++ leaq 512(%rsp),%rsi ++ mov $64,%rax ++ ++ .align 16 ++ .Lladder_base_small_loop: ++ mov %rdx,%r10 ++ mov %rcx,%r11 ++ mov %r8,%r12 ++ mov %r9,%r13 ++ shr $1,%rdx ++ shr $1,%rcx ++ shr $1,%r8 ++ shr $1,%r9 ++ and $1,%r10d ++ and $1,%r11d ++ and $1,%r12d ++ and $1,%r13d ++ neg %r10 ++ neg %r11 ++ neg %r12 ++ neg %r13 ++ movl %r10d,0(%rsi) ++ movl %r11d,256(%rsi) ++ movl %r12d,512(%rsi) ++ movl %r13d,768(%rsi) ++ add $4,%rsi ++ sub $1,%rax ++ jne .Lladder_base_small_loop ++ mov $255,%rdx ++ add $760,%rsi ++ ++ .align 16 ++ .Lladder_base_loop: ++ sub $1,%rdx ++ vbroadcastss 0(%rsi),%xmm10 ++ sub $4,%rsi ++ vmovdqa 0(%rsp),%xmm11 ++ vmovdqa 80(%rsp),%xmm12 ++ vpxor %xmm11,%xmm0,%xmm13 ++ vpand %xmm10,%xmm13,%xmm13 ++ vpxor %xmm13,%xmm0,%xmm0 ++ vpxor %xmm13,%xmm11,%xmm11 ++ vpxor %xmm12,%xmm1,%xmm13 ++ vpand %xmm10,%xmm13,%xmm13 ++ vpxor %xmm13,%xmm1,%xmm1 ++ vpxor %xmm13,%xmm12,%xmm12 ++ vmovdqa 16(%rsp),%xmm13 ++ vmovdqa 96(%rsp),%xmm14 ++ vpxor %xmm13,%xmm2,%xmm15 ++ vpand %xmm10,%xmm15,%xmm15 ++ vpxor %xmm15,%xmm2,%xmm2 ++ vpxor %xmm15,%xmm13,%xmm13 ++ vpxor %xmm14,%xmm3,%xmm15 ++ vpand %xmm10,%xmm15,%xmm15 ++ vpxor %xmm15,%xmm3,%xmm3 ++ vpxor %xmm15,%xmm14,%xmm14 ++ vmovdqa %xmm13,0(%rsp) ++ vmovdqa %xmm14,16(%rsp) ++ vmovdqa 32(%rsp),%xmm13 ++ vmovdqa 112(%rsp),%xmm14 ++ vpxor %xmm13,%xmm4,%xmm15 ++ vpand %xmm10,%xmm15,%xmm15 ++ vpxor %xmm15,%xmm4,%xmm4 ++ vpxor %xmm15,%xmm13,%xmm13 ++ vpxor %xmm14,%xmm5,%xmm15 ++ vpand %xmm10,%xmm15,%xmm15 ++ vpxor %xmm15,%xmm5,%xmm5 ++ vpxor %xmm15,%xmm14,%xmm14 ++ vmovdqa %xmm13,32(%rsp) ++ vmovdqa %xmm14,80(%rsp) ++ vmovdqa 48(%rsp),%xmm13 ++ vmovdqa 128(%rsp),%xmm14 ++ vpxor %xmm13,%xmm6,%xmm15 ++ vpand %xmm10,%xmm15,%xmm15 ++ vpxor %xmm15,%xmm6,%xmm6 ++ vpxor %xmm15,%xmm13,%xmm13 ++ vpxor %xmm14,%xmm7,%xmm15 ++ vpand %xmm10,%xmm15,%xmm15 ++ vpxor %xmm15,%xmm7,%xmm7 ++ vpxor %xmm15,%xmm14,%xmm14 ++ vmovdqa %xmm13,48(%rsp) ++ vmovdqa %xmm14,96(%rsp) ++ vmovdqa 64(%rsp),%xmm13 ++ vmovdqa 144(%rsp),%xmm14 ++ vpxor %xmm13,%xmm8,%xmm15 ++ vpand %xmm10,%xmm15,%xmm15 ++ vpxor %xmm15,%xmm8,%xmm8 ++ vpxor %xmm15,%xmm13,%xmm13 ++ vpxor %xmm14,%xmm9,%xmm15 ++ vpand %xmm10,%xmm15,%xmm15 ++ vpxor %xmm15,%xmm9,%xmm9 ++ vpxor %xmm15,%xmm14,%xmm14 ++ vmovdqa %xmm13,64(%rsp) ++ vmovdqa %xmm14,112(%rsp) ++ vpaddq curve25519_sandy2x_subc0(%rip),%xmm11,%xmm10 ++ vpsubq %xmm12,%xmm10,%xmm10 ++ vpaddq %xmm12,%xmm11,%xmm11 ++ vpunpckhqdq %xmm10,%xmm11,%xmm12 ++ vpunpcklqdq %xmm10,%xmm11,%xmm10 ++ vpaddq %xmm1,%xmm0,%xmm11 ++ vpaddq curve25519_sandy2x_subc0(%rip),%xmm0,%xmm0 ++ vpsubq %xmm1,%xmm0,%xmm0 ++ vpunpckhqdq %xmm11,%xmm0,%xmm1 ++ vpunpcklqdq %xmm11,%xmm0,%xmm0 ++ vpmuludq %xmm0,%xmm10,%xmm11 ++ vpmuludq %xmm1,%xmm10,%xmm13 ++ vmovdqa %xmm1,128(%rsp) ++ vpaddq %xmm1,%xmm1,%xmm1 ++ vpmuludq %xmm0,%xmm12,%xmm14 ++ vmovdqa %xmm0,144(%rsp) ++ vpaddq %xmm14,%xmm13,%xmm13 ++ vpmuludq %xmm1,%xmm12,%xmm0 ++ vmovdqa 
%xmm1,160(%rsp) ++ vpaddq %xmm3,%xmm2,%xmm1 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm2,%xmm2 ++ vpsubq %xmm3,%xmm2,%xmm2 ++ vpunpckhqdq %xmm1,%xmm2,%xmm3 ++ vpunpcklqdq %xmm1,%xmm2,%xmm1 ++ vpmuludq %xmm1,%xmm10,%xmm2 ++ vpaddq %xmm2,%xmm0,%xmm0 ++ vpmuludq %xmm3,%xmm10,%xmm2 ++ vmovdqa %xmm3,176(%rsp) ++ vpaddq %xmm3,%xmm3,%xmm3 ++ vpmuludq %xmm1,%xmm12,%xmm14 ++ vmovdqa %xmm1,192(%rsp) ++ vpaddq %xmm14,%xmm2,%xmm2 ++ vpmuludq %xmm3,%xmm12,%xmm1 ++ vmovdqa %xmm3,208(%rsp) ++ vpaddq %xmm5,%xmm4,%xmm3 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm4,%xmm4 ++ vpsubq %xmm5,%xmm4,%xmm4 ++ vpunpckhqdq %xmm3,%xmm4,%xmm5 ++ vpunpcklqdq %xmm3,%xmm4,%xmm3 ++ vpmuludq %xmm3,%xmm10,%xmm4 ++ vpaddq %xmm4,%xmm1,%xmm1 ++ vpmuludq %xmm5,%xmm10,%xmm4 ++ vmovdqa %xmm5,224(%rsp) ++ vpaddq %xmm5,%xmm5,%xmm5 ++ vpmuludq %xmm3,%xmm12,%xmm14 ++ vmovdqa %xmm3,240(%rsp) ++ vpaddq %xmm14,%xmm4,%xmm4 ++ vpaddq %xmm7,%xmm6,%xmm3 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm6,%xmm6 ++ vpsubq %xmm7,%xmm6,%xmm6 ++ vpunpckhqdq %xmm3,%xmm6,%xmm7 ++ vpunpcklqdq %xmm3,%xmm6,%xmm3 ++ vpmuludq %xmm3,%xmm10,%xmm6 ++ vpmuludq %xmm5,%xmm12,%xmm14 ++ vmovdqa %xmm5,256(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm5,%xmm5 ++ vmovdqa %xmm5,272(%rsp) ++ vpaddq %xmm14,%xmm6,%xmm6 ++ vpmuludq %xmm7,%xmm10,%xmm5 ++ vmovdqa %xmm7,288(%rsp) ++ vpaddq %xmm7,%xmm7,%xmm7 ++ vpmuludq %xmm3,%xmm12,%xmm14 ++ vmovdqa %xmm3,304(%rsp) ++ vpaddq %xmm14,%xmm5,%xmm5 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 ++ vmovdqa %xmm3,320(%rsp) ++ vpaddq %xmm9,%xmm8,%xmm3 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm8,%xmm8 ++ vpsubq %xmm9,%xmm8,%xmm8 ++ vpunpckhqdq %xmm3,%xmm8,%xmm9 ++ vpunpcklqdq %xmm3,%xmm8,%xmm3 ++ vmovdqa %xmm3,336(%rsp) ++ vpmuludq %xmm7,%xmm12,%xmm8 ++ vmovdqa %xmm7,352(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm7,%xmm7 ++ vmovdqa %xmm7,368(%rsp) ++ vpmuludq %xmm3,%xmm10,%xmm7 ++ vpaddq %xmm7,%xmm8,%xmm8 ++ vpmuludq %xmm9,%xmm10,%xmm7 ++ vmovdqa %xmm9,384(%rsp) ++ vpaddq %xmm9,%xmm9,%xmm9 ++ vpmuludq %xmm3,%xmm12,%xmm10 ++ vpaddq %xmm10,%xmm7,%xmm7 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 ++ vmovdqa %xmm3,400(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm12,%xmm12 ++ vpmuludq %xmm9,%xmm12,%xmm3 ++ vmovdqa %xmm9,416(%rsp) ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vmovdqa 0(%rsp),%xmm3 ++ vmovdqa 16(%rsp),%xmm9 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 ++ vpsubq %xmm9,%xmm10,%xmm10 ++ vpaddq %xmm9,%xmm3,%xmm3 ++ vpunpckhqdq %xmm10,%xmm3,%xmm9 ++ vpunpcklqdq %xmm10,%xmm3,%xmm3 ++ vpmuludq 144(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm0,%xmm0 ++ vpmuludq 128(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm2,%xmm2 ++ vpmuludq 192(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm1,%xmm1 ++ vpmuludq 176(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm4,%xmm4 ++ vpmuludq 240(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm6,%xmm6 ++ vpmuludq 224(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm5,%xmm5 ++ vpmuludq 304(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm8,%xmm8 ++ vpmuludq 288(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm7,%xmm7 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 ++ vpmuludq 336(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm11,%xmm11 ++ vpmuludq 384(%rsp),%xmm3,%xmm3 ++ vpaddq %xmm3,%xmm13,%xmm13 ++ vpmuludq 144(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm2,%xmm2 ++ vpmuludq 160(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm1,%xmm1 ++ vpmuludq 192(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm4,%xmm4 ++ vpmuludq 208(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm6,%xmm6 ++ vpmuludq 240(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm5,%xmm5 ++ vpmuludq 
256(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm8,%xmm8 ++ vpmuludq 304(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm7,%xmm7 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 ++ vpmuludq 352(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vpmuludq 336(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm13,%xmm13 ++ vpmuludq 416(%rsp),%xmm9,%xmm9 ++ vpaddq %xmm9,%xmm0,%xmm0 ++ vmovdqa 32(%rsp),%xmm3 ++ vmovdqa 80(%rsp),%xmm9 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 ++ vpsubq %xmm9,%xmm10,%xmm10 ++ vpaddq %xmm9,%xmm3,%xmm3 ++ vpunpckhqdq %xmm10,%xmm3,%xmm9 ++ vpunpcklqdq %xmm10,%xmm3,%xmm3 ++ vpmuludq 144(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm1,%xmm1 ++ vpmuludq 128(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm4,%xmm4 ++ vpmuludq 192(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm6,%xmm6 ++ vpmuludq 176(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm5,%xmm5 ++ vpmuludq 240(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm8,%xmm8 ++ vpmuludq 224(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm7,%xmm7 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 ++ vpmuludq 304(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm11,%xmm11 ++ vpmuludq 288(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm13,%xmm13 ++ vpmuludq 336(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm0,%xmm0 ++ vpmuludq 384(%rsp),%xmm3,%xmm3 ++ vpaddq %xmm3,%xmm2,%xmm2 ++ vpmuludq 144(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm4,%xmm4 ++ vpmuludq 160(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm6,%xmm6 ++ vpmuludq 192(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm5,%xmm5 ++ vpmuludq 208(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm8,%xmm8 ++ vpmuludq 240(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm7,%xmm7 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 ++ vpmuludq 256(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vpmuludq 304(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm13,%xmm13 ++ vpmuludq 352(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm0,%xmm0 ++ vpmuludq 336(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm2,%xmm2 ++ vpmuludq 416(%rsp),%xmm9,%xmm9 ++ vpaddq %xmm9,%xmm1,%xmm1 ++ vmovdqa 48(%rsp),%xmm3 ++ vmovdqa 96(%rsp),%xmm9 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 ++ vpsubq %xmm9,%xmm10,%xmm10 ++ vpaddq %xmm9,%xmm3,%xmm3 ++ vpunpckhqdq %xmm10,%xmm3,%xmm9 ++ vpunpcklqdq %xmm10,%xmm3,%xmm3 ++ vpmuludq 144(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm6,%xmm6 ++ vpmuludq 128(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm5,%xmm5 ++ vpmuludq 192(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm8,%xmm8 ++ vpmuludq 176(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm7,%xmm7 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 ++ vpmuludq 240(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm11,%xmm11 ++ vpmuludq 224(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm13,%xmm13 ++ vpmuludq 304(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm0,%xmm0 ++ vpmuludq 288(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm2,%xmm2 ++ vpmuludq 336(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm1,%xmm1 ++ vpmuludq 384(%rsp),%xmm3,%xmm3 ++ vpaddq %xmm3,%xmm4,%xmm4 ++ vpmuludq 144(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm5,%xmm5 ++ vpmuludq 160(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm8,%xmm8 ++ vpmuludq 192(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm7,%xmm7 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 ++ vpmuludq 208(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vpmuludq 240(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm13,%xmm13 ++ vpmuludq 256(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm0,%xmm0 ++ vpmuludq 304(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm2,%xmm2 ++ vpmuludq 352(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm1,%xmm1 ++ vpmuludq 336(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm4,%xmm4 ++ vpmuludq 416(%rsp),%xmm9,%xmm9 ++ 
vpaddq %xmm9,%xmm6,%xmm6 ++ vmovdqa 64(%rsp),%xmm3 ++ vmovdqa 112(%rsp),%xmm9 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 ++ vpsubq %xmm9,%xmm10,%xmm10 ++ vpaddq %xmm9,%xmm3,%xmm3 ++ vpunpckhqdq %xmm10,%xmm3,%xmm9 ++ vpunpcklqdq %xmm10,%xmm3,%xmm3 ++ vpmuludq 144(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm8,%xmm8 ++ vpmuludq 128(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm7,%xmm7 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 ++ vpmuludq 192(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm11,%xmm11 ++ vpmuludq 176(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm13,%xmm13 ++ vpmuludq 240(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm0,%xmm0 ++ vpmuludq 224(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm2,%xmm2 ++ vpmuludq 304(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm1,%xmm1 ++ vpmuludq 288(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm4,%xmm4 ++ vpmuludq 336(%rsp),%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm6,%xmm6 ++ vpmuludq 384(%rsp),%xmm3,%xmm3 ++ vpaddq %xmm3,%xmm5,%xmm5 ++ vpmuludq 144(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm7,%xmm7 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 ++ vpmuludq 160(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vpmuludq 192(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm13,%xmm13 ++ vpmuludq 208(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm0,%xmm0 ++ vpmuludq 240(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm2,%xmm2 ++ vpmuludq 256(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm1,%xmm1 ++ vpmuludq 304(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm4,%xmm4 ++ vpmuludq 352(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm6,%xmm6 ++ vpmuludq 336(%rsp),%xmm9,%xmm3 ++ vpaddq %xmm3,%xmm5,%xmm5 ++ vpmuludq 416(%rsp),%xmm9,%xmm9 ++ vpaddq %xmm9,%xmm8,%xmm8 ++ vpsrlq $25,%xmm4,%xmm3 ++ vpaddq %xmm3,%xmm6,%xmm6 ++ vpand curve25519_sandy2x_m25(%rip),%xmm4,%xmm4 ++ vpsrlq $26,%xmm11,%xmm3 ++ vpaddq %xmm3,%xmm13,%xmm13 ++ vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 ++ vpsrlq $26,%xmm6,%xmm3 ++ vpaddq %xmm3,%xmm5,%xmm5 ++ vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6 ++ vpsrlq $25,%xmm13,%xmm3 ++ vpaddq %xmm3,%xmm0,%xmm0 ++ vpand curve25519_sandy2x_m25(%rip),%xmm13,%xmm13 ++ vpsrlq $25,%xmm5,%xmm3 ++ vpaddq %xmm3,%xmm8,%xmm8 ++ vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5 ++ vpsrlq $26,%xmm0,%xmm3 ++ vpaddq %xmm3,%xmm2,%xmm2 ++ vpand curve25519_sandy2x_m26(%rip),%xmm0,%xmm0 ++ vpsrlq $26,%xmm8,%xmm3 ++ vpaddq %xmm3,%xmm7,%xmm7 ++ vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8 ++ vpsrlq $25,%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm1,%xmm1 ++ vpand curve25519_sandy2x_m25(%rip),%xmm2,%xmm2 ++ vpsrlq $25,%xmm7,%xmm3 ++ vpsllq $4,%xmm3,%xmm9 ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vpsllq $1,%xmm3,%xmm3 ++ vpaddq %xmm3,%xmm9,%xmm9 ++ vpaddq %xmm9,%xmm11,%xmm11 ++ vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7 ++ vpsrlq $26,%xmm1,%xmm3 ++ vpaddq %xmm3,%xmm4,%xmm4 ++ vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1 ++ vpsrlq $26,%xmm11,%xmm3 ++ vpaddq %xmm3,%xmm13,%xmm13 ++ vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 ++ vpsrlq $25,%xmm4,%xmm3 ++ vpaddq %xmm3,%xmm6,%xmm6 ++ vpand curve25519_sandy2x_m25(%rip),%xmm4,%xmm4 ++ vpunpcklqdq %xmm13,%xmm11,%xmm3 ++ vpunpckhqdq %xmm13,%xmm11,%xmm9 ++ vpaddq curve25519_sandy2x_subc0(%rip),%xmm9,%xmm10 ++ vpsubq %xmm3,%xmm10,%xmm10 ++ vpaddq %xmm9,%xmm3,%xmm3 ++ vpunpckhqdq %xmm3,%xmm10,%xmm9 ++ vpunpcklqdq %xmm3,%xmm10,%xmm10 ++ vpmuludq %xmm10,%xmm10,%xmm3 ++ vpaddq %xmm10,%xmm10,%xmm10 ++ vpmuludq %xmm9,%xmm10,%xmm11 ++ vpunpcklqdq %xmm2,%xmm0,%xmm12 ++ vpunpckhqdq %xmm2,%xmm0,%xmm0 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm0,%xmm2 ++ vpsubq %xmm12,%xmm2,%xmm2 ++ vpaddq %xmm0,%xmm12,%xmm12 ++ vpunpckhqdq 
%xmm12,%xmm2,%xmm0 ++ vpunpcklqdq %xmm12,%xmm2,%xmm2 ++ vpmuludq %xmm2,%xmm10,%xmm12 ++ vpaddq %xmm9,%xmm9,%xmm13 ++ vpmuludq %xmm13,%xmm9,%xmm9 ++ vpaddq %xmm9,%xmm12,%xmm12 ++ vpmuludq %xmm0,%xmm10,%xmm9 ++ vpmuludq %xmm2,%xmm13,%xmm14 ++ vpaddq %xmm14,%xmm9,%xmm9 ++ vpunpcklqdq %xmm4,%xmm1,%xmm14 ++ vpunpckhqdq %xmm4,%xmm1,%xmm1 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm1,%xmm4 ++ vpsubq %xmm14,%xmm4,%xmm4 ++ vpaddq %xmm1,%xmm14,%xmm14 ++ vpunpckhqdq %xmm14,%xmm4,%xmm1 ++ vpunpcklqdq %xmm14,%xmm4,%xmm4 ++ vmovdqa %xmm1,0(%rsp) ++ vpaddq %xmm1,%xmm1,%xmm1 ++ vmovdqa %xmm1,16(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 ++ vmovdqa %xmm1,32(%rsp) ++ vpmuludq %xmm4,%xmm10,%xmm1 ++ vpmuludq %xmm2,%xmm2,%xmm14 ++ vpaddq %xmm14,%xmm1,%xmm1 ++ vpmuludq 0(%rsp),%xmm10,%xmm14 ++ vpmuludq %xmm4,%xmm13,%xmm15 ++ vpaddq %xmm15,%xmm14,%xmm14 ++ vpunpcklqdq %xmm5,%xmm6,%xmm15 ++ vpunpckhqdq %xmm5,%xmm6,%xmm5 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm5,%xmm6 ++ vpsubq %xmm15,%xmm6,%xmm6 ++ vpaddq %xmm5,%xmm15,%xmm15 ++ vpunpckhqdq %xmm15,%xmm6,%xmm5 ++ vpunpcklqdq %xmm15,%xmm6,%xmm6 ++ vmovdqa %xmm6,48(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm6,%xmm6 ++ vmovdqa %xmm6,64(%rsp) ++ vmovdqa %xmm5,80(%rsp) ++ vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm5,%xmm5 ++ vmovdqa %xmm5,96(%rsp) ++ vpmuludq 48(%rsp),%xmm10,%xmm5 ++ vpaddq %xmm0,%xmm0,%xmm6 ++ vpmuludq %xmm6,%xmm0,%xmm0 ++ vpaddq %xmm0,%xmm5,%xmm5 ++ vpmuludq 80(%rsp),%xmm10,%xmm0 ++ vpmuludq %xmm4,%xmm6,%xmm15 ++ vpaddq %xmm15,%xmm0,%xmm0 ++ vpmuludq %xmm6,%xmm13,%xmm15 ++ vpaddq %xmm15,%xmm1,%xmm1 ++ vpmuludq %xmm6,%xmm2,%xmm15 ++ vpaddq %xmm15,%xmm14,%xmm14 ++ vpunpcklqdq %xmm7,%xmm8,%xmm15 ++ vpunpckhqdq %xmm7,%xmm8,%xmm7 ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm7,%xmm8 ++ vpsubq %xmm15,%xmm8,%xmm8 ++ vpaddq %xmm7,%xmm15,%xmm15 ++ vpunpckhqdq %xmm15,%xmm8,%xmm7 ++ vpunpcklqdq %xmm15,%xmm8,%xmm8 ++ vmovdqa %xmm8,112(%rsp) ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm8,%xmm8 ++ vmovdqa %xmm8,160(%rsp) ++ vpmuludq 112(%rsp),%xmm10,%xmm8 ++ vpmuludq %xmm7,%xmm10,%xmm10 ++ vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm7,%xmm15 ++ vpmuludq %xmm15,%xmm7,%xmm7 ++ vpaddq %xmm7,%xmm8,%xmm8 ++ vpmuludq %xmm15,%xmm13,%xmm7 ++ vpaddq %xmm7,%xmm3,%xmm3 ++ vpmuludq %xmm15,%xmm2,%xmm7 ++ vpaddq %xmm7,%xmm11,%xmm11 ++ vpmuludq 80(%rsp),%xmm13,%xmm7 ++ vpaddq %xmm7,%xmm7,%xmm7 ++ vpaddq %xmm7,%xmm8,%xmm8 ++ vpmuludq 16(%rsp),%xmm13,%xmm7 ++ vpaddq %xmm7,%xmm5,%xmm5 ++ vpmuludq 48(%rsp),%xmm13,%xmm7 ++ vpaddq %xmm7,%xmm0,%xmm0 ++ vpmuludq 112(%rsp),%xmm13,%xmm7 ++ vpaddq %xmm7,%xmm10,%xmm10 ++ vpmuludq %xmm15,%xmm6,%xmm7 ++ vpaddq %xmm7,%xmm12,%xmm12 ++ vpmuludq %xmm15,%xmm4,%xmm7 ++ vpaddq %xmm7,%xmm9,%xmm9 ++ vpaddq %xmm2,%xmm2,%xmm2 ++ vpmuludq %xmm4,%xmm2,%xmm7 ++ vpaddq %xmm7,%xmm5,%xmm5 ++ vpmuludq 160(%rsp),%xmm2,%xmm7 ++ vpaddq %xmm7,%xmm3,%xmm3 ++ vpmuludq 160(%rsp),%xmm6,%xmm7 ++ vpaddq %xmm7,%xmm11,%xmm11 ++ vpmuludq 0(%rsp),%xmm2,%xmm7 ++ vpaddq %xmm7,%xmm0,%xmm0 ++ vpmuludq 48(%rsp),%xmm2,%xmm7 ++ vpaddq %xmm7,%xmm8,%xmm8 ++ vpmuludq 80(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpmuludq 96(%rsp),%xmm4,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpmuludq %xmm4,%xmm4,%xmm2 ++ vpaddq %xmm2,%xmm8,%xmm8 ++ vpaddq %xmm4,%xmm4,%xmm2 ++ vpmuludq 160(%rsp),%xmm2,%xmm4 ++ vpaddq %xmm4,%xmm12,%xmm12 ++ vpmuludq 16(%rsp),%xmm15,%xmm4 ++ vpaddq %xmm4,%xmm1,%xmm1 ++ vpmuludq 48(%rsp),%xmm15,%xmm4 ++ vpaddq %xmm4,%xmm14,%xmm14 ++ vpmuludq 96(%rsp),%xmm6,%xmm4 ++ vpaddq %xmm4,%xmm3,%xmm3 ++ vmovdqa 16(%rsp),%xmm4 ++ 
vpmuludq 160(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm9,%xmm9 ++ vpmuludq 16(%rsp),%xmm6,%xmm4 ++ vpaddq %xmm4,%xmm8,%xmm8 ++ vpmuludq 48(%rsp),%xmm6,%xmm4 ++ vpaddq %xmm4,%xmm10,%xmm10 ++ vpmuludq 80(%rsp),%xmm15,%xmm4 ++ vpaddq %xmm4,%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm5,%xmm5 ++ vpmuludq 112(%rsp),%xmm15,%xmm4 ++ vpaddq %xmm4,%xmm0,%xmm0 ++ vmovdqa 48(%rsp),%xmm4 ++ vpaddq %xmm4,%xmm4,%xmm4 ++ vpmuludq 160(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm1,%xmm1 ++ vmovdqa 80(%rsp),%xmm4 ++ vpaddq %xmm4,%xmm4,%xmm4 ++ vpmuludq 160(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm14,%xmm14 ++ vpmuludq 64(%rsp),%xmm2,%xmm4 ++ vpaddq %xmm4,%xmm3,%xmm3 ++ vmovdqa 16(%rsp),%xmm4 ++ vpmuludq 64(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm11,%xmm11 ++ vmovdqa 16(%rsp),%xmm4 ++ vpmuludq 96(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm12,%xmm12 ++ vmovdqa 48(%rsp),%xmm4 ++ vpmuludq 96(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm9,%xmm9 ++ vpmuludq 0(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vmovdqa 32(%rsp),%xmm2 ++ vpmuludq 0(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm3,%xmm3 ++ vmovdqa 64(%rsp),%xmm2 ++ vpmuludq 48(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vmovdqa 96(%rsp),%xmm2 ++ vpmuludq 80(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm1,%xmm1 ++ vmovdqa 160(%rsp),%xmm2 ++ vpmuludq 112(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vpsrlq $26,%xmm3,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpand curve25519_sandy2x_m26(%rip),%xmm3,%xmm3 ++ vpsrlq $25,%xmm14,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14 ++ vpsrlq $25,%xmm11,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vpand curve25519_sandy2x_m25(%rip),%xmm11,%xmm11 ++ vpsrlq $26,%xmm5,%xmm2 ++ vpaddq %xmm2,%xmm0,%xmm0 ++ vpand curve25519_sandy2x_m26(%rip),%xmm5,%xmm5 ++ vpsrlq $26,%xmm12,%xmm2 ++ vpaddq %xmm2,%xmm9,%xmm9 ++ vpand curve25519_sandy2x_m26(%rip),%xmm12,%xmm12 ++ vpsrlq $25,%xmm0,%xmm2 ++ vpaddq %xmm2,%xmm8,%xmm8 ++ vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0 ++ vpsrlq $25,%xmm9,%xmm2 ++ vpaddq %xmm2,%xmm1,%xmm1 ++ vpand curve25519_sandy2x_m25(%rip),%xmm9,%xmm9 ++ vpsrlq $26,%xmm8,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8 ++ vpsrlq $26,%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm14,%xmm14 ++ vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1 ++ vpsrlq $25,%xmm10,%xmm2 ++ vpsllq $4,%xmm2,%xmm4 ++ vpaddq %xmm2,%xmm3,%xmm3 ++ vpsllq $1,%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm3,%xmm3 ++ vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 ++ vpsrlq $25,%xmm14,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14 ++ vpsrlq $26,%xmm3,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpand curve25519_sandy2x_m26(%rip),%xmm3,%xmm3 ++ vpunpckhqdq %xmm11,%xmm3,%xmm2 ++ vmovdqa %xmm2,0(%rsp) ++ vpunpcklqdq %xmm11,%xmm3,%xmm2 ++ vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm2,%xmm2 ++ vmovdqa %xmm2,80(%rsp) ++ vpunpckhqdq %xmm9,%xmm12,%xmm2 ++ vmovdqa %xmm2,16(%rsp) ++ vpunpcklqdq %xmm9,%xmm12,%xmm2 ++ vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm2,%xmm2 ++ vmovdqa %xmm2,96(%rsp) ++ vpunpckhqdq %xmm14,%xmm1,%xmm2 ++ vmovdqa %xmm2,32(%rsp) ++ vpunpcklqdq %xmm14,%xmm1,%xmm1 ++ vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm1,%xmm1 ++ vmovdqa %xmm1,112(%rsp) ++ vpunpckhqdq %xmm0,%xmm5,%xmm1 ++ vmovdqa %xmm1,48(%rsp) ++ vpunpcklqdq %xmm0,%xmm5,%xmm0 ++ vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm0,%xmm0 ++ vmovdqa %xmm0,160(%rsp) ++ vpunpckhqdq %xmm10,%xmm8,%xmm0 ++ vmovdqa %xmm0,64(%rsp) ++ vpunpcklqdq %xmm10,%xmm8,%xmm0 ++ vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm0,%xmm0 ++ vmovdqa 
%xmm0,208(%rsp) ++ vmovdqa 144(%rsp),%xmm0 ++ vpmuludq %xmm0,%xmm0,%xmm1 ++ vpaddq %xmm0,%xmm0,%xmm0 ++ vmovdqa 128(%rsp),%xmm2 ++ vpmuludq %xmm2,%xmm0,%xmm3 ++ vmovdqa 192(%rsp),%xmm4 ++ vpmuludq %xmm4,%xmm0,%xmm5 ++ vmovdqa 176(%rsp),%xmm6 ++ vpmuludq %xmm6,%xmm0,%xmm7 ++ vmovdqa 240(%rsp),%xmm8 ++ vpmuludq %xmm8,%xmm0,%xmm9 ++ vpmuludq 224(%rsp),%xmm0,%xmm10 ++ vpmuludq 304(%rsp),%xmm0,%xmm11 ++ vpmuludq 288(%rsp),%xmm0,%xmm12 ++ vpmuludq 336(%rsp),%xmm0,%xmm13 ++ vmovdqa 384(%rsp),%xmm14 ++ vpmuludq %xmm14,%xmm0,%xmm0 ++ vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm14,%xmm15 ++ vpmuludq %xmm15,%xmm14,%xmm14 ++ vpaddq %xmm14,%xmm13,%xmm13 ++ vpaddq %xmm6,%xmm6,%xmm14 ++ vpmuludq %xmm14,%xmm6,%xmm6 ++ vpaddq %xmm6,%xmm11,%xmm11 ++ vpaddq %xmm2,%xmm2,%xmm6 ++ vpmuludq %xmm6,%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vpmuludq %xmm15,%xmm6,%xmm2 ++ vpaddq %xmm2,%xmm1,%xmm1 ++ vpmuludq %xmm15,%xmm4,%xmm2 ++ vpaddq %xmm2,%xmm3,%xmm3 ++ vpmuludq 256(%rsp),%xmm6,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpmuludq 304(%rsp),%xmm6,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vpmuludq 352(%rsp),%xmm6,%xmm2 ++ vpaddq %xmm2,%xmm13,%xmm13 ++ vpmuludq 336(%rsp),%xmm6,%xmm2 ++ vpaddq %xmm2,%xmm0,%xmm0 ++ vpmuludq %xmm4,%xmm6,%xmm2 ++ vpaddq %xmm2,%xmm7,%xmm7 ++ vpmuludq %xmm14,%xmm6,%xmm2 ++ vpaddq %xmm2,%xmm9,%xmm9 ++ vpmuludq %xmm8,%xmm6,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpmuludq %xmm15,%xmm14,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vpmuludq %xmm15,%xmm8,%xmm2 ++ vpaddq %xmm2,%xmm7,%xmm7 ++ vpmuludq %xmm4,%xmm4,%xmm2 ++ vpaddq %xmm2,%xmm9,%xmm9 ++ vpmuludq %xmm14,%xmm4,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpaddq %xmm4,%xmm4,%xmm2 ++ vpmuludq %xmm8,%xmm2,%xmm4 ++ vpaddq %xmm4,%xmm11,%xmm11 ++ vpmuludq 400(%rsp),%xmm2,%xmm4 ++ vpaddq %xmm4,%xmm1,%xmm1 ++ vpmuludq 400(%rsp),%xmm14,%xmm4 ++ vpaddq %xmm4,%xmm3,%xmm3 ++ vpmuludq 224(%rsp),%xmm2,%xmm4 ++ vpaddq %xmm4,%xmm12,%xmm12 ++ vpmuludq 304(%rsp),%xmm2,%xmm4 ++ vpaddq %xmm4,%xmm13,%xmm13 ++ vpmuludq 288(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm0,%xmm0 ++ vpmuludq 368(%rsp),%xmm8,%xmm2 ++ vpaddq %xmm2,%xmm3,%xmm3 ++ vpmuludq %xmm8,%xmm14,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vpmuludq %xmm8,%xmm8,%xmm2 ++ vpaddq %xmm2,%xmm13,%xmm13 ++ vpaddq %xmm8,%xmm8,%xmm2 ++ vpmuludq 400(%rsp),%xmm2,%xmm4 ++ vpaddq %xmm4,%xmm5,%xmm5 ++ vpmuludq 256(%rsp),%xmm15,%xmm4 ++ vpaddq %xmm4,%xmm9,%xmm9 ++ vpmuludq 304(%rsp),%xmm15,%xmm4 ++ vpaddq %xmm4,%xmm10,%xmm10 ++ vpmuludq 368(%rsp),%xmm14,%xmm4 ++ vpaddq %xmm4,%xmm1,%xmm1 ++ vmovdqa 256(%rsp),%xmm4 ++ vpmuludq 400(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm7,%xmm7 ++ vpmuludq 256(%rsp),%xmm14,%xmm4 ++ vpaddq %xmm4,%xmm13,%xmm13 ++ vpmuludq 304(%rsp),%xmm14,%xmm4 ++ vpaddq %xmm4,%xmm0,%xmm0 ++ vpmuludq 352(%rsp),%xmm15,%xmm4 ++ vpaddq %xmm4,%xmm11,%xmm11 ++ vpmuludq 336(%rsp),%xmm15,%xmm4 ++ vpaddq %xmm4,%xmm12,%xmm12 ++ vmovdqa 304(%rsp),%xmm4 ++ vpaddq %xmm4,%xmm4,%xmm4 ++ vpmuludq 400(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm9,%xmm9 ++ vpmuludq 320(%rsp),%xmm2,%xmm4 ++ vpaddq %xmm4,%xmm1,%xmm1 ++ vmovdqa 256(%rsp),%xmm4 ++ vpmuludq 320(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm3,%xmm3 ++ vmovdqa 256(%rsp),%xmm4 ++ vpmuludq 368(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm5,%xmm5 ++ vmovdqa 304(%rsp),%xmm4 ++ vpmuludq 368(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm7,%xmm7 ++ vmovdqa 352(%rsp),%xmm4 ++ vpmuludq 400(%rsp),%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm10,%xmm10 ++ vpmuludq 224(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm0,%xmm0 ++ vmovdqa 272(%rsp),%xmm2 ++ vpmuludq 224(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm1,%xmm1 ++ vmovdqa 320(%rsp),%xmm2 ++ vpmuludq 
304(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vmovdqa 368(%rsp),%xmm2 ++ vpmuludq 288(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm9,%xmm9 ++ vmovdqa 400(%rsp),%xmm2 ++ vpmuludq 336(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpsrlq $26,%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm3,%xmm3 ++ vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1 ++ vpsrlq $25,%xmm10,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 ++ vpsrlq $25,%xmm3,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vpand curve25519_sandy2x_m25(%rip),%xmm3,%xmm3 ++ vpsrlq $26,%xmm11,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 ++ vpsrlq $26,%xmm5,%xmm2 ++ vpaddq %xmm2,%xmm7,%xmm7 ++ vpand curve25519_sandy2x_m26(%rip),%xmm5,%xmm5 ++ vpsrlq $25,%xmm12,%xmm2 ++ vpaddq %xmm2,%xmm13,%xmm13 ++ vpand curve25519_sandy2x_m25(%rip),%xmm12,%xmm12 ++ vpsrlq $25,%xmm7,%xmm2 ++ vpaddq %xmm2,%xmm9,%xmm9 ++ vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7 ++ vpsrlq $26,%xmm13,%xmm2 ++ vpaddq %xmm2,%xmm0,%xmm0 ++ vpand curve25519_sandy2x_m26(%rip),%xmm13,%xmm13 ++ vpsrlq $26,%xmm9,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9 ++ vpsrlq $25,%xmm0,%xmm2 ++ vpsllq $4,%xmm2,%xmm4 ++ vpaddq %xmm2,%xmm1,%xmm1 ++ vpsllq $1,%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm4,%xmm4 ++ vpaddq %xmm4,%xmm1,%xmm1 ++ vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0 ++ vpsrlq $25,%xmm10,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 ++ vpsrlq $26,%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm3,%xmm3 ++ vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1 ++ vpunpckhqdq %xmm3,%xmm1,%xmm2 ++ vpunpcklqdq %xmm3,%xmm1,%xmm1 ++ vmovdqa %xmm1,176(%rsp) ++ vpaddq curve25519_sandy2x_subc0(%rip),%xmm2,%xmm3 ++ vpsubq %xmm1,%xmm3,%xmm3 ++ vpunpckhqdq %xmm3,%xmm2,%xmm1 ++ vpunpcklqdq %xmm3,%xmm2,%xmm2 ++ vmovdqa %xmm2,192(%rsp) ++ vmovdqa %xmm1,224(%rsp) ++ vpsllq $1,%xmm1,%xmm1 ++ vmovdqa %xmm1,240(%rsp) ++ vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm3,%xmm3 ++ vmovdqa 80(%rsp),%xmm1 ++ vpunpcklqdq %xmm1,%xmm3,%xmm2 ++ vpunpckhqdq %xmm1,%xmm3,%xmm1 ++ vpunpckhqdq %xmm7,%xmm5,%xmm3 ++ vpunpcklqdq %xmm7,%xmm5,%xmm4 ++ vmovdqa %xmm4,256(%rsp) ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm5 ++ vpsubq %xmm4,%xmm5,%xmm5 ++ vpunpckhqdq %xmm5,%xmm3,%xmm4 ++ vpunpcklqdq %xmm5,%xmm3,%xmm3 ++ vmovdqa %xmm3,272(%rsp) ++ vmovdqa %xmm4,288(%rsp) ++ vpsllq $1,%xmm4,%xmm4 ++ vmovdqa %xmm4,304(%rsp) ++ vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm5,%xmm5 ++ vmovdqa 96(%rsp),%xmm3 ++ vpunpcklqdq %xmm3,%xmm5,%xmm4 ++ vpunpckhqdq %xmm3,%xmm5,%xmm3 ++ vpunpckhqdq %xmm10,%xmm9,%xmm5 ++ vpunpcklqdq %xmm10,%xmm9,%xmm6 ++ vmovdqa %xmm6,320(%rsp) ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm5,%xmm7 ++ vpsubq %xmm6,%xmm7,%xmm7 ++ vpunpckhqdq %xmm7,%xmm5,%xmm6 ++ vpunpcklqdq %xmm7,%xmm5,%xmm5 ++ vmovdqa %xmm5,336(%rsp) ++ vmovdqa %xmm6,352(%rsp) ++ vpsllq $1,%xmm6,%xmm6 ++ vmovdqa %xmm6,368(%rsp) ++ vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm7,%xmm7 ++ vmovdqa 112(%rsp),%xmm5 ++ vpunpcklqdq %xmm5,%xmm7,%xmm6 ++ vpunpckhqdq %xmm5,%xmm7,%xmm5 ++ vpunpckhqdq %xmm12,%xmm11,%xmm7 ++ vpunpcklqdq %xmm12,%xmm11,%xmm8 ++ vmovdqa %xmm8,384(%rsp) ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm7,%xmm9 ++ vpsubq %xmm8,%xmm9,%xmm9 ++ vpunpckhqdq %xmm9,%xmm7,%xmm8 ++ vpunpcklqdq %xmm9,%xmm7,%xmm7 ++ vmovdqa %xmm7,400(%rsp) ++ vmovdqa %xmm8,416(%rsp) ++ vpsllq $1,%xmm8,%xmm8 ++ vmovdqa %xmm8,432(%rsp) ++ vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm9,%xmm9 ++ vmovdqa 
160(%rsp),%xmm7 ++ vpunpcklqdq %xmm7,%xmm9,%xmm8 ++ vpunpckhqdq %xmm7,%xmm9,%xmm7 ++ vpunpckhqdq %xmm0,%xmm13,%xmm9 ++ vpunpcklqdq %xmm0,%xmm13,%xmm0 ++ vmovdqa %xmm0,160(%rsp) ++ vpaddq curve25519_sandy2x_subc2(%rip),%xmm9,%xmm10 ++ vpsubq %xmm0,%xmm10,%xmm10 ++ vpunpckhqdq %xmm10,%xmm9,%xmm0 ++ vpunpcklqdq %xmm10,%xmm9,%xmm9 ++ vmovdqa %xmm9,448(%rsp) ++ vmovdqa %xmm0,464(%rsp) ++ vpsllq $1,%xmm0,%xmm0 ++ vmovdqa %xmm0,480(%rsp) ++ vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm10,%xmm10 ++ vmovdqa 208(%rsp),%xmm0 ++ vpunpcklqdq %xmm0,%xmm10,%xmm9 ++ vpunpckhqdq %xmm0,%xmm10,%xmm0 ++ vpsrlq $26,%xmm2,%xmm10 ++ vpaddq %xmm10,%xmm1,%xmm1 ++ vpand curve25519_sandy2x_m26(%rip),%xmm2,%xmm2 ++ vpsrlq $25,%xmm5,%xmm10 ++ vpaddq %xmm10,%xmm8,%xmm8 ++ vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5 ++ vpsrlq $25,%xmm1,%xmm10 ++ vpaddq %xmm10,%xmm4,%xmm4 ++ vpand curve25519_sandy2x_m25(%rip),%xmm1,%xmm1 ++ vpsrlq $26,%xmm8,%xmm10 ++ vpaddq %xmm10,%xmm7,%xmm7 ++ vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8 ++ vpsrlq $26,%xmm4,%xmm10 ++ vpaddq %xmm10,%xmm3,%xmm3 ++ vpand curve25519_sandy2x_m26(%rip),%xmm4,%xmm4 ++ vpsrlq $25,%xmm7,%xmm10 ++ vpaddq %xmm10,%xmm9,%xmm9 ++ vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7 ++ vpsrlq $25,%xmm3,%xmm10 ++ vpaddq %xmm10,%xmm6,%xmm6 ++ vpand curve25519_sandy2x_m25(%rip),%xmm3,%xmm3 ++ vpsrlq $26,%xmm9,%xmm10 ++ vpaddq %xmm10,%xmm0,%xmm0 ++ vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9 ++ vpsrlq $26,%xmm6,%xmm10 ++ vpaddq %xmm10,%xmm5,%xmm5 ++ vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6 ++ vpsrlq $25,%xmm0,%xmm10 ++ vpsllq $4,%xmm10,%xmm11 ++ vpaddq %xmm10,%xmm2,%xmm2 ++ vpsllq $1,%xmm10,%xmm10 ++ vpaddq %xmm10,%xmm11,%xmm11 ++ vpaddq %xmm11,%xmm2,%xmm2 ++ vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0 ++ vpsrlq $25,%xmm5,%xmm10 ++ vpaddq %xmm10,%xmm8,%xmm8 ++ vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5 ++ vpsrlq $26,%xmm2,%xmm10 ++ vpaddq %xmm10,%xmm1,%xmm1 ++ vpand curve25519_sandy2x_m26(%rip),%xmm2,%xmm2 ++ vpunpckhqdq %xmm1,%xmm2,%xmm10 ++ vmovdqa %xmm10,80(%rsp) ++ vpunpcklqdq %xmm1,%xmm2,%xmm1 ++ vpunpckhqdq %xmm3,%xmm4,%xmm2 ++ vmovdqa %xmm2,96(%rsp) ++ vpunpcklqdq %xmm3,%xmm4,%xmm2 ++ vpunpckhqdq %xmm5,%xmm6,%xmm3 ++ vmovdqa %xmm3,112(%rsp) ++ vpunpcklqdq %xmm5,%xmm6,%xmm3 ++ vpunpckhqdq %xmm7,%xmm8,%xmm4 ++ vmovdqa %xmm4,128(%rsp) ++ vpunpcklqdq %xmm7,%xmm8,%xmm4 ++ vpunpckhqdq %xmm0,%xmm9,%xmm5 ++ vmovdqa %xmm5,144(%rsp) ++ vpunpcklqdq %xmm0,%xmm9,%xmm0 ++ vmovdqa 176(%rsp),%xmm5 ++ vpaddq %xmm5,%xmm1,%xmm1 ++ vpunpcklqdq %xmm1,%xmm5,%xmm6 ++ vpunpckhqdq %xmm1,%xmm5,%xmm1 ++ vpmuludq 224(%rsp),%xmm6,%xmm5 ++ vpmuludq 192(%rsp),%xmm1,%xmm7 ++ vpaddq %xmm7,%xmm5,%xmm5 ++ vpmuludq 272(%rsp),%xmm6,%xmm7 ++ vpmuludq 240(%rsp),%xmm1,%xmm8 ++ vpaddq %xmm8,%xmm7,%xmm7 ++ vpmuludq 288(%rsp),%xmm6,%xmm8 ++ vpmuludq 272(%rsp),%xmm1,%xmm9 ++ vpaddq %xmm9,%xmm8,%xmm8 ++ vpmuludq 336(%rsp),%xmm6,%xmm9 ++ vpmuludq 304(%rsp),%xmm1,%xmm10 ++ vpaddq %xmm10,%xmm9,%xmm9 ++ vpmuludq 352(%rsp),%xmm6,%xmm10 ++ vpmuludq 336(%rsp),%xmm1,%xmm11 ++ vpaddq %xmm11,%xmm10,%xmm10 ++ vpmuludq 400(%rsp),%xmm6,%xmm11 ++ vpmuludq 368(%rsp),%xmm1,%xmm12 ++ vpaddq %xmm12,%xmm11,%xmm11 ++ vpmuludq 416(%rsp),%xmm6,%xmm12 ++ vpmuludq 400(%rsp),%xmm1,%xmm13 ++ vpaddq %xmm13,%xmm12,%xmm12 ++ vpmuludq 448(%rsp),%xmm6,%xmm13 ++ vpmuludq 432(%rsp),%xmm1,%xmm14 ++ vpaddq %xmm14,%xmm13,%xmm13 ++ vpmuludq 464(%rsp),%xmm6,%xmm14 ++ vpmuludq 448(%rsp),%xmm1,%xmm15 ++ vpaddq %xmm15,%xmm14,%xmm14 ++ vpmuludq 192(%rsp),%xmm6,%xmm6 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 ++ 
vpmuludq 480(%rsp),%xmm1,%xmm1 ++ vpaddq %xmm1,%xmm6,%xmm6 ++ vmovdqa 256(%rsp),%xmm1 ++ vpaddq %xmm1,%xmm2,%xmm2 ++ vpunpcklqdq %xmm2,%xmm1,%xmm15 ++ vpunpckhqdq %xmm2,%xmm1,%xmm1 ++ vpmuludq 192(%rsp),%xmm15,%xmm2 ++ vpaddq %xmm2,%xmm7,%xmm7 ++ vpmuludq 224(%rsp),%xmm15,%xmm2 ++ vpaddq %xmm2,%xmm8,%xmm8 ++ vpmuludq 272(%rsp),%xmm15,%xmm2 ++ vpaddq %xmm2,%xmm9,%xmm9 ++ vpmuludq 288(%rsp),%xmm15,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpmuludq 336(%rsp),%xmm15,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpmuludq 352(%rsp),%xmm15,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vpmuludq 400(%rsp),%xmm15,%xmm2 ++ vpaddq %xmm2,%xmm13,%xmm13 ++ vpmuludq 416(%rsp),%xmm15,%xmm2 ++ vpaddq %xmm2,%xmm14,%xmm14 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm15,%xmm15 ++ vpmuludq 448(%rsp),%xmm15,%xmm2 ++ vpaddq %xmm2,%xmm6,%xmm6 ++ vpmuludq 464(%rsp),%xmm15,%xmm15 ++ vpaddq %xmm15,%xmm5,%xmm5 ++ vpmuludq 192(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm8,%xmm8 ++ vpmuludq 240(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm9,%xmm9 ++ vpmuludq 272(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpmuludq 304(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpmuludq 336(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vpmuludq 368(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm13,%xmm13 ++ vpmuludq 400(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm14,%xmm14 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 ++ vpmuludq 432(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm6,%xmm6 ++ vpmuludq 448(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vpmuludq 480(%rsp),%xmm1,%xmm1 ++ vpaddq %xmm1,%xmm7,%xmm7 ++ vmovdqa 320(%rsp),%xmm1 ++ vpaddq %xmm1,%xmm3,%xmm3 ++ vpunpcklqdq %xmm3,%xmm1,%xmm2 ++ vpunpckhqdq %xmm3,%xmm1,%xmm1 ++ vpmuludq 192(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm9,%xmm9 ++ vpmuludq 224(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm10,%xmm10 ++ vpmuludq 272(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vpmuludq 288(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm12,%xmm12 ++ vpmuludq 336(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm13,%xmm13 ++ vpmuludq 352(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm14,%xmm14 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2 ++ vpmuludq 400(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm6,%xmm6 ++ vpmuludq 416(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm5,%xmm5 ++ vpmuludq 448(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm7,%xmm7 ++ vpmuludq 464(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm8,%xmm8 ++ vpmuludq 192(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpmuludq 240(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm11,%xmm11 ++ vpmuludq 272(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vpmuludq 304(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm13,%xmm13 ++ vpmuludq 336(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm14,%xmm14 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 ++ vpmuludq 368(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm6,%xmm6 ++ vpmuludq 400(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vpmuludq 432(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm7,%xmm7 ++ vpmuludq 448(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm8,%xmm8 ++ vpmuludq 480(%rsp),%xmm1,%xmm1 ++ vpaddq %xmm1,%xmm9,%xmm9 ++ vmovdqa 384(%rsp),%xmm1 ++ vpaddq %xmm1,%xmm4,%xmm4 ++ vpunpcklqdq %xmm4,%xmm1,%xmm2 ++ vpunpckhqdq %xmm4,%xmm1,%xmm1 ++ vpmuludq 192(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm11,%xmm11 ++ vpmuludq 224(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm12,%xmm12 ++ vpmuludq 272(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm13,%xmm13 ++ vpmuludq 288(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm14,%xmm14 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2 ++ vpmuludq 336(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm6,%xmm6 
++ vpmuludq 352(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm5,%xmm5 ++ vpmuludq 400(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm7,%xmm7 ++ vpmuludq 416(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm8,%xmm8 ++ vpmuludq 448(%rsp),%xmm2,%xmm3 ++ vpaddq %xmm3,%xmm9,%xmm9 ++ vpmuludq 464(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpmuludq 192(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vpmuludq 240(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm13,%xmm13 ++ vpmuludq 272(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm14,%xmm14 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 ++ vpmuludq 304(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm6,%xmm6 ++ vpmuludq 336(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm5,%xmm5 ++ vpmuludq 368(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm7,%xmm7 ++ vpmuludq 400(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm8,%xmm8 ++ vpmuludq 432(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm9,%xmm9 ++ vpmuludq 448(%rsp),%xmm1,%xmm2 ++ vpaddq %xmm2,%xmm10,%xmm10 ++ vpmuludq 480(%rsp),%xmm1,%xmm1 ++ vpaddq %xmm1,%xmm11,%xmm11 ++ vmovdqa 160(%rsp),%xmm1 ++ vpaddq %xmm1,%xmm0,%xmm0 ++ vpunpcklqdq %xmm0,%xmm1,%xmm2 ++ vpunpckhqdq %xmm0,%xmm1,%xmm0 ++ vpmuludq 192(%rsp),%xmm2,%xmm1 ++ vpaddq %xmm1,%xmm13,%xmm13 ++ vpmuludq 224(%rsp),%xmm2,%xmm1 ++ vpaddq %xmm1,%xmm14,%xmm14 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2 ++ vpmuludq 272(%rsp),%xmm2,%xmm1 ++ vpaddq %xmm1,%xmm6,%xmm6 ++ vpmuludq 288(%rsp),%xmm2,%xmm1 ++ vpaddq %xmm1,%xmm5,%xmm5 ++ vpmuludq 336(%rsp),%xmm2,%xmm1 ++ vpaddq %xmm1,%xmm7,%xmm7 ++ vpmuludq 352(%rsp),%xmm2,%xmm1 ++ vpaddq %xmm1,%xmm8,%xmm8 ++ vpmuludq 400(%rsp),%xmm2,%xmm1 ++ vpaddq %xmm1,%xmm9,%xmm9 ++ vpmuludq 416(%rsp),%xmm2,%xmm1 ++ vpaddq %xmm1,%xmm10,%xmm10 ++ vpmuludq 448(%rsp),%xmm2,%xmm1 ++ vpaddq %xmm1,%xmm11,%xmm11 ++ vpmuludq 464(%rsp),%xmm2,%xmm2 ++ vpaddq %xmm2,%xmm12,%xmm12 ++ vpmuludq 192(%rsp),%xmm0,%xmm1 ++ vpaddq %xmm1,%xmm14,%xmm14 ++ vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm0,%xmm0 ++ vpmuludq 240(%rsp),%xmm0,%xmm1 ++ vpaddq %xmm1,%xmm6,%xmm6 ++ vpmuludq 272(%rsp),%xmm0,%xmm1 ++ vpaddq %xmm1,%xmm5,%xmm5 ++ vpmuludq 304(%rsp),%xmm0,%xmm1 ++ vpaddq %xmm1,%xmm7,%xmm7 ++ vpmuludq 336(%rsp),%xmm0,%xmm1 ++ vpaddq %xmm1,%xmm8,%xmm8 ++ vpmuludq 368(%rsp),%xmm0,%xmm1 ++ vpaddq %xmm1,%xmm9,%xmm9 ++ vpmuludq 400(%rsp),%xmm0,%xmm1 ++ vpaddq %xmm1,%xmm10,%xmm10 ++ vpmuludq 432(%rsp),%xmm0,%xmm1 ++ vpaddq %xmm1,%xmm11,%xmm11 ++ vpmuludq 448(%rsp),%xmm0,%xmm1 ++ vpaddq %xmm1,%xmm12,%xmm12 ++ vpmuludq 480(%rsp),%xmm0,%xmm0 ++ vpaddq %xmm0,%xmm13,%xmm13 ++ vpsrlq $26,%xmm6,%xmm0 ++ vpaddq %xmm0,%xmm5,%xmm5 ++ vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6 ++ vpsrlq $25,%xmm10,%xmm0 ++ vpaddq %xmm0,%xmm11,%xmm11 ++ vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 ++ vpsrlq $25,%xmm5,%xmm0 ++ vpaddq %xmm0,%xmm7,%xmm7 ++ vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5 ++ vpsrlq $26,%xmm11,%xmm0 ++ vpaddq %xmm0,%xmm12,%xmm12 ++ vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 ++ vpsrlq $26,%xmm7,%xmm0 ++ vpaddq %xmm0,%xmm8,%xmm8 ++ vpand curve25519_sandy2x_m26(%rip),%xmm7,%xmm7 ++ vpsrlq $25,%xmm12,%xmm0 ++ vpaddq %xmm0,%xmm13,%xmm13 ++ vpand curve25519_sandy2x_m25(%rip),%xmm12,%xmm12 ++ vpsrlq $25,%xmm8,%xmm0 ++ vpaddq %xmm0,%xmm9,%xmm9 ++ vpand curve25519_sandy2x_m25(%rip),%xmm8,%xmm8 ++ vpsrlq $26,%xmm13,%xmm0 ++ vpaddq %xmm0,%xmm14,%xmm14 ++ vpand curve25519_sandy2x_m26(%rip),%xmm13,%xmm13 ++ vpsrlq $26,%xmm9,%xmm0 ++ vpaddq %xmm0,%xmm10,%xmm10 ++ vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9 ++ vpsrlq $25,%xmm14,%xmm0 ++ vpsllq $4,%xmm0,%xmm1 ++ vpaddq %xmm0,%xmm6,%xmm6 ++ vpsllq $1,%xmm0,%xmm0 
++ vpaddq %xmm0,%xmm1,%xmm1
++ vpaddq %xmm1,%xmm6,%xmm6
++ vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14
++ vpsrlq $25,%xmm10,%xmm0
++ vpaddq %xmm0,%xmm11,%xmm11
++ vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
++ vpsrlq $26,%xmm6,%xmm0
++ vpaddq %xmm0,%xmm5,%xmm5
++ vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6
++ vpunpckhqdq %xmm5,%xmm6,%xmm1
++ vpunpcklqdq %xmm5,%xmm6,%xmm0
++ vpunpckhqdq %xmm8,%xmm7,%xmm3
++ vpunpcklqdq %xmm8,%xmm7,%xmm2
++ vpunpckhqdq %xmm10,%xmm9,%xmm5
++ vpunpcklqdq %xmm10,%xmm9,%xmm4
++ vpunpckhqdq %xmm12,%xmm11,%xmm7
++ vpunpcklqdq %xmm12,%xmm11,%xmm6
++ vpunpckhqdq %xmm14,%xmm13,%xmm9
++ vpunpcklqdq %xmm14,%xmm13,%xmm8
++ cmp $0,%rdx
++ jne .Lladder_base_loop
++ vmovdqu %xmm1,80(%rdi)
++ vmovdqu %xmm0,0(%rdi)
++ vmovdqu %xmm3,96(%rdi)
++ vmovdqu %xmm2,16(%rdi)
++ vmovdqu %xmm5,112(%rdi)
++ vmovdqu %xmm4,32(%rdi)
++ vmovdqu %xmm7,128(%rdi)
++ vmovdqu %xmm6,48(%rdi)
++ vmovdqu %xmm9,144(%rdi)
++ vmovdqu %xmm8,64(%rdi)
++ movq 1536(%rsp),%r11
++ movq 1544(%rsp),%r12
++ movq 1552(%rsp),%r13
++ add %r11,%rsp
++ ret
++ENDPROC(curve25519_sandy2x_ladder_base)
+--- /dev/null	2017-07-05 16:27:37.615351856 +0200
++++ b/net/wireguard/crypto/curve25519-neon-arm.S	2017-07-06 18:17:33.000000000 +0200
+@@ -0,0 +1,2104 @@
++/*
++ * Copyright (C) 2017 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ * Based on algorithms from Daniel J. Bernstein and Peter Schwabe.
++ */
++
++#include <linux/linkage.h>
++
++ .text
++ .fpu neon
++ .align 4
++
++ENTRY(curve25519_asm_neon)
++ vpush {q4,q5,q6,q7}
++ mov r12,sp
++ sub sp,sp,#736
++ and sp,sp,#0xffffffe0
++ strd r4,[sp,#0]
++ strd r6,[sp,#8]
++ strd r8,[sp,#16]
++ strd r10,[sp,#24]
++ str r12,[sp,#480]
++ str r14,[sp,#484]
++ mov r0,r0
++ mov r1,r1
++ mov r2,r2
++ add r3,sp,#32
++ ldr r4,=0
++ ldr r5,=254
++ vmov.i32 q0,#1
++ vshr.u64 q1,q0,#7
++ vshr.u64 q0,q0,#8
++ vmov.i32 d4,#19
++ vmov.i32 d5,#38
++ add r6,sp,#512
++ vst1.8 {d2-d3},[r6,: 128]
++ add r6,sp,#528
++ vst1.8 {d0-d1},[r6,: 128]
++ add r6,sp,#544
++ vst1.8 {d4-d5},[r6,: 128]
++ add r6,r3,#0
++ vmov.i32 q2,#0
++ vst1.8 {d4-d5},[r6,: 128]!
++ vst1.8 {d4-d5},[r6,: 128]!
++ vst1.8 d4,[r6,: 64]
++ add r6,r3,#0
++ ldr r7,=960
++ sub r7,r7,#2
++ neg r7,r7
++ sub r7,r7,r7,LSL #7
++ str r7,[r6]
++ add r6,sp,#704
++ vld1.8 {d4-d5},[r1]!
++ vld1.8 {d6-d7},[r1]
++ vst1.8 {d4-d5},[r6,: 128]!
++ vst1.8 {d6-d7},[r6,: 128] ++ sub r1,r6,#16 ++ ldrb r6,[r1] ++ and r6,r6,#248 ++ strb r6,[r1] ++ ldrb r6,[r1,#31] ++ and r6,r6,#127 ++ orr r6,r6,#64 ++ strb r6,[r1,#31] ++ vmov.i64 q2,#0xffffffff ++ vshr.u64 q3,q2,#7 ++ vshr.u64 q2,q2,#6 ++ vld1.8 {d8},[r2] ++ vld1.8 {d10},[r2] ++ add r2,r2,#6 ++ vld1.8 {d12},[r2] ++ vld1.8 {d14},[r2] ++ add r2,r2,#6 ++ vld1.8 {d16},[r2] ++ add r2,r2,#4 ++ vld1.8 {d18},[r2] ++ vld1.8 {d20},[r2] ++ add r2,r2,#6 ++ vld1.8 {d22},[r2] ++ add r2,r2,#2 ++ vld1.8 {d24},[r2] ++ vld1.8 {d26},[r2] ++ vshr.u64 q5,q5,#26 ++ vshr.u64 q6,q6,#3 ++ vshr.u64 q7,q7,#29 ++ vshr.u64 q8,q8,#6 ++ vshr.u64 q10,q10,#25 ++ vshr.u64 q11,q11,#3 ++ vshr.u64 q12,q12,#12 ++ vshr.u64 q13,q13,#38 ++ vand q4,q4,q2 ++ vand q6,q6,q2 ++ vand q8,q8,q2 ++ vand q10,q10,q2 ++ vand q2,q12,q2 ++ vand q5,q5,q3 ++ vand q7,q7,q3 ++ vand q9,q9,q3 ++ vand q11,q11,q3 ++ vand q3,q13,q3 ++ add r2,r3,#48 ++ vadd.i64 q12,q4,q1 ++ vadd.i64 q13,q10,q1 ++ vshr.s64 q12,q12,#26 ++ vshr.s64 q13,q13,#26 ++ vadd.i64 q5,q5,q12 ++ vshl.i64 q12,q12,#26 ++ vadd.i64 q14,q5,q0 ++ vadd.i64 q11,q11,q13 ++ vshl.i64 q13,q13,#26 ++ vadd.i64 q15,q11,q0 ++ vsub.i64 q4,q4,q12 ++ vshr.s64 q12,q14,#25 ++ vsub.i64 q10,q10,q13 ++ vshr.s64 q13,q15,#25 ++ vadd.i64 q6,q6,q12 ++ vshl.i64 q12,q12,#25 ++ vadd.i64 q14,q6,q1 ++ vadd.i64 q2,q2,q13 ++ vsub.i64 q5,q5,q12 ++ vshr.s64 q12,q14,#26 ++ vshl.i64 q13,q13,#25 ++ vadd.i64 q14,q2,q1 ++ vadd.i64 q7,q7,q12 ++ vshl.i64 q12,q12,#26 ++ vadd.i64 q15,q7,q0 ++ vsub.i64 q11,q11,q13 ++ vshr.s64 q13,q14,#26 ++ vsub.i64 q6,q6,q12 ++ vshr.s64 q12,q15,#25 ++ vadd.i64 q3,q3,q13 ++ vshl.i64 q13,q13,#26 ++ vadd.i64 q14,q3,q0 ++ vadd.i64 q8,q8,q12 ++ vshl.i64 q12,q12,#25 ++ vadd.i64 q15,q8,q1 ++ add r2,r2,#8 ++ vsub.i64 q2,q2,q13 ++ vshr.s64 q13,q14,#25 ++ vsub.i64 q7,q7,q12 ++ vshr.s64 q12,q15,#26 ++ vadd.i64 q14,q13,q13 ++ vadd.i64 q9,q9,q12 ++ vtrn.32 d12,d14 ++ vshl.i64 q12,q12,#26 ++ vtrn.32 d13,d15 ++ vadd.i64 q0,q9,q0 ++ vadd.i64 q4,q4,q14 ++ vst1.8 d12,[r2,: 64]! ++ vshl.i64 q6,q13,#4 ++ vsub.i64 q7,q8,q12 ++ vshr.s64 q0,q0,#25 ++ vadd.i64 q4,q4,q6 ++ vadd.i64 q6,q10,q0 ++ vshl.i64 q0,q0,#25 ++ vadd.i64 q8,q6,q1 ++ vadd.i64 q4,q4,q13 ++ vshl.i64 q10,q13,#25 ++ vadd.i64 q1,q4,q1 ++ vsub.i64 q0,q9,q0 ++ vshr.s64 q8,q8,#26 ++ vsub.i64 q3,q3,q10 ++ vtrn.32 d14,d0 ++ vshr.s64 q1,q1,#26 ++ vtrn.32 d15,d1 ++ vadd.i64 q0,q11,q8 ++ vst1.8 d14,[r2,: 64] ++ vshl.i64 q7,q8,#26 ++ vadd.i64 q5,q5,q1 ++ vtrn.32 d4,d6 ++ vshl.i64 q1,q1,#26 ++ vtrn.32 d5,d7 ++ vsub.i64 q3,q6,q7 ++ add r2,r2,#16 ++ vsub.i64 q1,q4,q1 ++ vst1.8 d4,[r2,: 64] ++ vtrn.32 d6,d0 ++ vtrn.32 d7,d1 ++ sub r2,r2,#8 ++ vtrn.32 d2,d10 ++ vtrn.32 d3,d11 ++ vst1.8 d6,[r2,: 64] ++ sub r2,r2,#24 ++ vst1.8 d2,[r2,: 64] ++ add r2,r3,#96 ++ vmov.i32 q0,#0 ++ vmov.i64 d2,#0xff ++ vmov.i64 d3,#0 ++ vshr.u32 q1,q1,#7 ++ vst1.8 {d2-d3},[r2,: 128]! ++ vst1.8 {d0-d1},[r2,: 128]! ++ vst1.8 d0,[r2,: 64] ++ add r2,r3,#144 ++ vmov.i32 q0,#0 ++ vst1.8 {d0-d1},[r2,: 128]! ++ vst1.8 {d0-d1},[r2,: 128]! ++ vst1.8 d0,[r2,: 64] ++ add r2,r3,#240 ++ vmov.i32 q0,#0 ++ vmov.i64 d2,#0xff ++ vmov.i64 d3,#0 ++ vshr.u32 q1,q1,#7 ++ vst1.8 {d2-d3},[r2,: 128]! ++ vst1.8 {d0-d1},[r2,: 128]! ++ vst1.8 d0,[r2,: 64] ++ add r2,r3,#48 ++ add r6,r3,#192 ++ vld1.8 {d0-d1},[r2,: 128]! ++ vld1.8 {d2-d3},[r2,: 128]! ++ vld1.8 {d4},[r2,: 64] ++ vst1.8 {d0-d1},[r6,: 128]! ++ vst1.8 {d2-d3},[r6,: 128]! 
++ vst1.8 d4,[r6,: 64] ++ .Lmainloop: ++ mov r2,r5,LSR #3 ++ and r6,r5,#7 ++ ldrb r2,[r1,r2] ++ mov r2,r2,LSR r6 ++ and r2,r2,#1 ++ str r5,[sp,#488] ++ eor r4,r4,r2 ++ str r2,[sp,#492] ++ neg r2,r4 ++ add r4,r3,#96 ++ add r5,r3,#192 ++ add r6,r3,#144 ++ vld1.8 {d8-d9},[r4,: 128]! ++ add r7,r3,#240 ++ vld1.8 {d10-d11},[r5,: 128]! ++ veor q6,q4,q5 ++ vld1.8 {d14-d15},[r6,: 128]! ++ vdup.i32 q8,r2 ++ vld1.8 {d18-d19},[r7,: 128]! ++ veor q10,q7,q9 ++ vld1.8 {d22-d23},[r4,: 128]! ++ vand q6,q6,q8 ++ vld1.8 {d24-d25},[r5,: 128]! ++ vand q10,q10,q8 ++ vld1.8 {d26-d27},[r6,: 128]! ++ veor q4,q4,q6 ++ vld1.8 {d28-d29},[r7,: 128]! ++ veor q5,q5,q6 ++ vld1.8 {d0},[r4,: 64] ++ veor q6,q7,q10 ++ vld1.8 {d2},[r5,: 64] ++ veor q7,q9,q10 ++ vld1.8 {d4},[r6,: 64] ++ veor q9,q11,q12 ++ vld1.8 {d6},[r7,: 64] ++ veor q10,q0,q1 ++ sub r2,r4,#32 ++ vand q9,q9,q8 ++ sub r4,r5,#32 ++ vand q10,q10,q8 ++ sub r5,r6,#32 ++ veor q11,q11,q9 ++ sub r6,r7,#32 ++ veor q0,q0,q10 ++ veor q9,q12,q9 ++ veor q1,q1,q10 ++ veor q10,q13,q14 ++ veor q12,q2,q3 ++ vand q10,q10,q8 ++ vand q8,q12,q8 ++ veor q12,q13,q10 ++ veor q2,q2,q8 ++ veor q10,q14,q10 ++ veor q3,q3,q8 ++ vadd.i32 q8,q4,q6 ++ vsub.i32 q4,q4,q6 ++ vst1.8 {d16-d17},[r2,: 128]! ++ vadd.i32 q6,q11,q12 ++ vst1.8 {d8-d9},[r5,: 128]! ++ vsub.i32 q4,q11,q12 ++ vst1.8 {d12-d13},[r2,: 128]! ++ vadd.i32 q6,q0,q2 ++ vst1.8 {d8-d9},[r5,: 128]! ++ vsub.i32 q0,q0,q2 ++ vst1.8 d12,[r2,: 64] ++ vadd.i32 q2,q5,q7 ++ vst1.8 d0,[r5,: 64] ++ vsub.i32 q0,q5,q7 ++ vst1.8 {d4-d5},[r4,: 128]! ++ vadd.i32 q2,q9,q10 ++ vst1.8 {d0-d1},[r6,: 128]! ++ vsub.i32 q0,q9,q10 ++ vst1.8 {d4-d5},[r4,: 128]! ++ vadd.i32 q2,q1,q3 ++ vst1.8 {d0-d1},[r6,: 128]! ++ vsub.i32 q0,q1,q3 ++ vst1.8 d4,[r4,: 64] ++ vst1.8 d0,[r6,: 64] ++ add r2,sp,#544 ++ add r4,r3,#96 ++ add r5,r3,#144 ++ vld1.8 {d0-d1},[r2,: 128] ++ vld1.8 {d2-d3},[r4,: 128]! ++ vld1.8 {d4-d5},[r5,: 128]! ++ vzip.i32 q1,q2 ++ vld1.8 {d6-d7},[r4,: 128]! ++ vld1.8 {d8-d9},[r5,: 128]! 
++ vshl.i32 q5,q1,#1 ++ vzip.i32 q3,q4 ++ vshl.i32 q6,q2,#1 ++ vld1.8 {d14},[r4,: 64] ++ vshl.i32 q8,q3,#1 ++ vld1.8 {d15},[r5,: 64] ++ vshl.i32 q9,q4,#1 ++ vmul.i32 d21,d7,d1 ++ vtrn.32 d14,d15 ++ vmul.i32 q11,q4,q0 ++ vmul.i32 q0,q7,q0 ++ vmull.s32 q12,d2,d2 ++ vmlal.s32 q12,d11,d1 ++ vmlal.s32 q12,d12,d0 ++ vmlal.s32 q12,d13,d23 ++ vmlal.s32 q12,d16,d22 ++ vmlal.s32 q12,d7,d21 ++ vmull.s32 q10,d2,d11 ++ vmlal.s32 q10,d4,d1 ++ vmlal.s32 q10,d13,d0 ++ vmlal.s32 q10,d6,d23 ++ vmlal.s32 q10,d17,d22 ++ vmull.s32 q13,d10,d4 ++ vmlal.s32 q13,d11,d3 ++ vmlal.s32 q13,d13,d1 ++ vmlal.s32 q13,d16,d0 ++ vmlal.s32 q13,d17,d23 ++ vmlal.s32 q13,d8,d22 ++ vmull.s32 q1,d10,d5 ++ vmlal.s32 q1,d11,d4 ++ vmlal.s32 q1,d6,d1 ++ vmlal.s32 q1,d17,d0 ++ vmlal.s32 q1,d8,d23 ++ vmull.s32 q14,d10,d6 ++ vmlal.s32 q14,d11,d13 ++ vmlal.s32 q14,d4,d4 ++ vmlal.s32 q14,d17,d1 ++ vmlal.s32 q14,d18,d0 ++ vmlal.s32 q14,d9,d23 ++ vmull.s32 q11,d10,d7 ++ vmlal.s32 q11,d11,d6 ++ vmlal.s32 q11,d12,d5 ++ vmlal.s32 q11,d8,d1 ++ vmlal.s32 q11,d19,d0 ++ vmull.s32 q15,d10,d8 ++ vmlal.s32 q15,d11,d17 ++ vmlal.s32 q15,d12,d6 ++ vmlal.s32 q15,d13,d5 ++ vmlal.s32 q15,d19,d1 ++ vmlal.s32 q15,d14,d0 ++ vmull.s32 q2,d10,d9 ++ vmlal.s32 q2,d11,d8 ++ vmlal.s32 q2,d12,d7 ++ vmlal.s32 q2,d13,d6 ++ vmlal.s32 q2,d14,d1 ++ vmull.s32 q0,d15,d1 ++ vmlal.s32 q0,d10,d14 ++ vmlal.s32 q0,d11,d19 ++ vmlal.s32 q0,d12,d8 ++ vmlal.s32 q0,d13,d17 ++ vmlal.s32 q0,d6,d6 ++ add r2,sp,#512 ++ vld1.8 {d18-d19},[r2,: 128] ++ vmull.s32 q3,d16,d7 ++ vmlal.s32 q3,d10,d15 ++ vmlal.s32 q3,d11,d14 ++ vmlal.s32 q3,d12,d9 ++ vmlal.s32 q3,d13,d8 ++ add r2,sp,#528 ++ vld1.8 {d8-d9},[r2,: 128] ++ vadd.i64 q5,q12,q9 ++ vadd.i64 q6,q15,q9 ++ vshr.s64 q5,q5,#26 ++ vshr.s64 q6,q6,#26 ++ vadd.i64 q7,q10,q5 ++ vshl.i64 q5,q5,#26 ++ vadd.i64 q8,q7,q4 ++ vadd.i64 q2,q2,q6 ++ vshl.i64 q6,q6,#26 ++ vadd.i64 q10,q2,q4 ++ vsub.i64 q5,q12,q5 ++ vshr.s64 q8,q8,#25 ++ vsub.i64 q6,q15,q6 ++ vshr.s64 q10,q10,#25 ++ vadd.i64 q12,q13,q8 ++ vshl.i64 q8,q8,#25 ++ vadd.i64 q13,q12,q9 ++ vadd.i64 q0,q0,q10 ++ vsub.i64 q7,q7,q8 ++ vshr.s64 q8,q13,#26 ++ vshl.i64 q10,q10,#25 ++ vadd.i64 q13,q0,q9 ++ vadd.i64 q1,q1,q8 ++ vshl.i64 q8,q8,#26 ++ vadd.i64 q15,q1,q4 ++ vsub.i64 q2,q2,q10 ++ vshr.s64 q10,q13,#26 ++ vsub.i64 q8,q12,q8 ++ vshr.s64 q12,q15,#25 ++ vadd.i64 q3,q3,q10 ++ vshl.i64 q10,q10,#26 ++ vadd.i64 q13,q3,q4 ++ vadd.i64 q14,q14,q12 ++ add r2,r3,#288 ++ vshl.i64 q12,q12,#25 ++ add r4,r3,#336 ++ vadd.i64 q15,q14,q9 ++ add r2,r2,#8 ++ vsub.i64 q0,q0,q10 ++ add r4,r4,#8 ++ vshr.s64 q10,q13,#25 ++ vsub.i64 q1,q1,q12 ++ vshr.s64 q12,q15,#26 ++ vadd.i64 q13,q10,q10 ++ vadd.i64 q11,q11,q12 ++ vtrn.32 d16,d2 ++ vshl.i64 q12,q12,#26 ++ vtrn.32 d17,d3 ++ vadd.i64 q1,q11,q4 ++ vadd.i64 q4,q5,q13 ++ vst1.8 d16,[r2,: 64]! ++ vshl.i64 q5,q10,#4 ++ vst1.8 d17,[r4,: 64]! 
++ vsub.i64 q8,q14,q12 ++ vshr.s64 q1,q1,#25 ++ vadd.i64 q4,q4,q5 ++ vadd.i64 q5,q6,q1 ++ vshl.i64 q1,q1,#25 ++ vadd.i64 q6,q5,q9 ++ vadd.i64 q4,q4,q10 ++ vshl.i64 q10,q10,#25 ++ vadd.i64 q9,q4,q9 ++ vsub.i64 q1,q11,q1 ++ vshr.s64 q6,q6,#26 ++ vsub.i64 q3,q3,q10 ++ vtrn.32 d16,d2 ++ vshr.s64 q9,q9,#26 ++ vtrn.32 d17,d3 ++ vadd.i64 q1,q2,q6 ++ vst1.8 d16,[r2,: 64] ++ vshl.i64 q2,q6,#26 ++ vst1.8 d17,[r4,: 64] ++ vadd.i64 q6,q7,q9 ++ vtrn.32 d0,d6 ++ vshl.i64 q7,q9,#26 ++ vtrn.32 d1,d7 ++ vsub.i64 q2,q5,q2 ++ add r2,r2,#16 ++ vsub.i64 q3,q4,q7 ++ vst1.8 d0,[r2,: 64] ++ add r4,r4,#16 ++ vst1.8 d1,[r4,: 64] ++ vtrn.32 d4,d2 ++ vtrn.32 d5,d3 ++ sub r2,r2,#8 ++ sub r4,r4,#8 ++ vtrn.32 d6,d12 ++ vtrn.32 d7,d13 ++ vst1.8 d4,[r2,: 64] ++ vst1.8 d5,[r4,: 64] ++ sub r2,r2,#24 ++ sub r4,r4,#24 ++ vst1.8 d6,[r2,: 64] ++ vst1.8 d7,[r4,: 64] ++ add r2,r3,#240 ++ add r4,r3,#96 ++ vld1.8 {d0-d1},[r4,: 128]! ++ vld1.8 {d2-d3},[r4,: 128]! ++ vld1.8 {d4},[r4,: 64] ++ add r4,r3,#144 ++ vld1.8 {d6-d7},[r4,: 128]! ++ vtrn.32 q0,q3 ++ vld1.8 {d8-d9},[r4,: 128]! ++ vshl.i32 q5,q0,#4 ++ vtrn.32 q1,q4 ++ vshl.i32 q6,q3,#4 ++ vadd.i32 q5,q5,q0 ++ vadd.i32 q6,q6,q3 ++ vshl.i32 q7,q1,#4 ++ vld1.8 {d5},[r4,: 64] ++ vshl.i32 q8,q4,#4 ++ vtrn.32 d4,d5 ++ vadd.i32 q7,q7,q1 ++ vadd.i32 q8,q8,q4 ++ vld1.8 {d18-d19},[r2,: 128]! ++ vshl.i32 q10,q2,#4 ++ vld1.8 {d22-d23},[r2,: 128]! ++ vadd.i32 q10,q10,q2 ++ vld1.8 {d24},[r2,: 64] ++ vadd.i32 q5,q5,q0 ++ add r2,r3,#192 ++ vld1.8 {d26-d27},[r2,: 128]! ++ vadd.i32 q6,q6,q3 ++ vld1.8 {d28-d29},[r2,: 128]! ++ vadd.i32 q8,q8,q4 ++ vld1.8 {d25},[r2,: 64] ++ vadd.i32 q10,q10,q2 ++ vtrn.32 q9,q13 ++ vadd.i32 q7,q7,q1 ++ vadd.i32 q5,q5,q0 ++ vtrn.32 q11,q14 ++ vadd.i32 q6,q6,q3 ++ add r2,sp,#560 ++ vadd.i32 q10,q10,q2 ++ vtrn.32 d24,d25 ++ vst1.8 {d12-d13},[r2,: 128] ++ vshl.i32 q6,q13,#1 ++ add r2,sp,#576 ++ vst1.8 {d20-d21},[r2,: 128] ++ vshl.i32 q10,q14,#1 ++ add r2,sp,#592 ++ vst1.8 {d12-d13},[r2,: 128] ++ vshl.i32 q15,q12,#1 ++ vadd.i32 q8,q8,q4 ++ vext.32 d10,d31,d30,#0 ++ vadd.i32 q7,q7,q1 ++ add r2,sp,#608 ++ vst1.8 {d16-d17},[r2,: 128] ++ vmull.s32 q8,d18,d5 ++ vmlal.s32 q8,d26,d4 ++ vmlal.s32 q8,d19,d9 ++ vmlal.s32 q8,d27,d3 ++ vmlal.s32 q8,d22,d8 ++ vmlal.s32 q8,d28,d2 ++ vmlal.s32 q8,d23,d7 ++ vmlal.s32 q8,d29,d1 ++ vmlal.s32 q8,d24,d6 ++ vmlal.s32 q8,d25,d0 ++ add r2,sp,#624 ++ vst1.8 {d14-d15},[r2,: 128] ++ vmull.s32 q2,d18,d4 ++ vmlal.s32 q2,d12,d9 ++ vmlal.s32 q2,d13,d8 ++ vmlal.s32 q2,d19,d3 ++ vmlal.s32 q2,d22,d2 ++ vmlal.s32 q2,d23,d1 ++ vmlal.s32 q2,d24,d0 ++ add r2,sp,#640 ++ vst1.8 {d20-d21},[r2,: 128] ++ vmull.s32 q7,d18,d9 ++ vmlal.s32 q7,d26,d3 ++ vmlal.s32 q7,d19,d8 ++ vmlal.s32 q7,d27,d2 ++ vmlal.s32 q7,d22,d7 ++ vmlal.s32 q7,d28,d1 ++ vmlal.s32 q7,d23,d6 ++ vmlal.s32 q7,d29,d0 ++ add r2,sp,#656 ++ vst1.8 {d10-d11},[r2,: 128] ++ vmull.s32 q5,d18,d3 ++ vmlal.s32 q5,d19,d2 ++ vmlal.s32 q5,d22,d1 ++ vmlal.s32 q5,d23,d0 ++ vmlal.s32 q5,d12,d8 ++ add r2,sp,#672 ++ vst1.8 {d16-d17},[r2,: 128] ++ vmull.s32 q4,d18,d8 ++ vmlal.s32 q4,d26,d2 ++ vmlal.s32 q4,d19,d7 ++ vmlal.s32 q4,d27,d1 ++ vmlal.s32 q4,d22,d6 ++ vmlal.s32 q4,d28,d0 ++ vmull.s32 q8,d18,d7 ++ vmlal.s32 q8,d26,d1 ++ vmlal.s32 q8,d19,d6 ++ vmlal.s32 q8,d27,d0 ++ add r2,sp,#576 ++ vld1.8 {d20-d21},[r2,: 128] ++ vmlal.s32 q7,d24,d21 ++ vmlal.s32 q7,d25,d20 ++ vmlal.s32 q4,d23,d21 ++ vmlal.s32 q4,d29,d20 ++ vmlal.s32 q8,d22,d21 ++ vmlal.s32 q8,d28,d20 ++ vmlal.s32 q5,d24,d20 ++ add r2,sp,#576 ++ vst1.8 {d14-d15},[r2,: 128] ++ vmull.s32 q7,d18,d6 ++ vmlal.s32 q7,d26,d0 ++ add r2,sp,#656 ++ vld1.8 
{d30-d31},[r2,: 128] ++ vmlal.s32 q2,d30,d21 ++ vmlal.s32 q7,d19,d21 ++ vmlal.s32 q7,d27,d20 ++ add r2,sp,#624 ++ vld1.8 {d26-d27},[r2,: 128] ++ vmlal.s32 q4,d25,d27 ++ vmlal.s32 q8,d29,d27 ++ vmlal.s32 q8,d25,d26 ++ vmlal.s32 q7,d28,d27 ++ vmlal.s32 q7,d29,d26 ++ add r2,sp,#608 ++ vld1.8 {d28-d29},[r2,: 128] ++ vmlal.s32 q4,d24,d29 ++ vmlal.s32 q8,d23,d29 ++ vmlal.s32 q8,d24,d28 ++ vmlal.s32 q7,d22,d29 ++ vmlal.s32 q7,d23,d28 ++ add r2,sp,#608 ++ vst1.8 {d8-d9},[r2,: 128] ++ add r2,sp,#560 ++ vld1.8 {d8-d9},[r2,: 128] ++ vmlal.s32 q7,d24,d9 ++ vmlal.s32 q7,d25,d31 ++ vmull.s32 q1,d18,d2 ++ vmlal.s32 q1,d19,d1 ++ vmlal.s32 q1,d22,d0 ++ vmlal.s32 q1,d24,d27 ++ vmlal.s32 q1,d23,d20 ++ vmlal.s32 q1,d12,d7 ++ vmlal.s32 q1,d13,d6 ++ vmull.s32 q6,d18,d1 ++ vmlal.s32 q6,d19,d0 ++ vmlal.s32 q6,d23,d27 ++ vmlal.s32 q6,d22,d20 ++ vmlal.s32 q6,d24,d26 ++ vmull.s32 q0,d18,d0 ++ vmlal.s32 q0,d22,d27 ++ vmlal.s32 q0,d23,d26 ++ vmlal.s32 q0,d24,d31 ++ vmlal.s32 q0,d19,d20 ++ add r2,sp,#640 ++ vld1.8 {d18-d19},[r2,: 128] ++ vmlal.s32 q2,d18,d7 ++ vmlal.s32 q2,d19,d6 ++ vmlal.s32 q5,d18,d6 ++ vmlal.s32 q5,d19,d21 ++ vmlal.s32 q1,d18,d21 ++ vmlal.s32 q1,d19,d29 ++ vmlal.s32 q0,d18,d28 ++ vmlal.s32 q0,d19,d9 ++ vmlal.s32 q6,d18,d29 ++ vmlal.s32 q6,d19,d28 ++ add r2,sp,#592 ++ vld1.8 {d18-d19},[r2,: 128] ++ add r2,sp,#512 ++ vld1.8 {d22-d23},[r2,: 128] ++ vmlal.s32 q5,d19,d7 ++ vmlal.s32 q0,d18,d21 ++ vmlal.s32 q0,d19,d29 ++ vmlal.s32 q6,d18,d6 ++ add r2,sp,#528 ++ vld1.8 {d6-d7},[r2,: 128] ++ vmlal.s32 q6,d19,d21 ++ add r2,sp,#576 ++ vld1.8 {d18-d19},[r2,: 128] ++ vmlal.s32 q0,d30,d8 ++ add r2,sp,#672 ++ vld1.8 {d20-d21},[r2,: 128] ++ vmlal.s32 q5,d30,d29 ++ add r2,sp,#608 ++ vld1.8 {d24-d25},[r2,: 128] ++ vmlal.s32 q1,d30,d28 ++ vadd.i64 q13,q0,q11 ++ vadd.i64 q14,q5,q11 ++ vmlal.s32 q6,d30,d9 ++ vshr.s64 q4,q13,#26 ++ vshr.s64 q13,q14,#26 ++ vadd.i64 q7,q7,q4 ++ vshl.i64 q4,q4,#26 ++ vadd.i64 q14,q7,q3 ++ vadd.i64 q9,q9,q13 ++ vshl.i64 q13,q13,#26 ++ vadd.i64 q15,q9,q3 ++ vsub.i64 q0,q0,q4 ++ vshr.s64 q4,q14,#25 ++ vsub.i64 q5,q5,q13 ++ vshr.s64 q13,q15,#25 ++ vadd.i64 q6,q6,q4 ++ vshl.i64 q4,q4,#25 ++ vadd.i64 q14,q6,q11 ++ vadd.i64 q2,q2,q13 ++ vsub.i64 q4,q7,q4 ++ vshr.s64 q7,q14,#26 ++ vshl.i64 q13,q13,#25 ++ vadd.i64 q14,q2,q11 ++ vadd.i64 q8,q8,q7 ++ vshl.i64 q7,q7,#26 ++ vadd.i64 q15,q8,q3 ++ vsub.i64 q9,q9,q13 ++ vshr.s64 q13,q14,#26 ++ vsub.i64 q6,q6,q7 ++ vshr.s64 q7,q15,#25 ++ vadd.i64 q10,q10,q13 ++ vshl.i64 q13,q13,#26 ++ vadd.i64 q14,q10,q3 ++ vadd.i64 q1,q1,q7 ++ add r2,r3,#144 ++ vshl.i64 q7,q7,#25 ++ add r4,r3,#96 ++ vadd.i64 q15,q1,q11 ++ add r2,r2,#8 ++ vsub.i64 q2,q2,q13 ++ add r4,r4,#8 ++ vshr.s64 q13,q14,#25 ++ vsub.i64 q7,q8,q7 ++ vshr.s64 q8,q15,#26 ++ vadd.i64 q14,q13,q13 ++ vadd.i64 q12,q12,q8 ++ vtrn.32 d12,d14 ++ vshl.i64 q8,q8,#26 ++ vtrn.32 d13,d15 ++ vadd.i64 q3,q12,q3 ++ vadd.i64 q0,q0,q14 ++ vst1.8 d12,[r2,: 64]! ++ vshl.i64 q7,q13,#4 ++ vst1.8 d13,[r4,: 64]! 
++ vsub.i64 q1,q1,q8 ++ vshr.s64 q3,q3,#25 ++ vadd.i64 q0,q0,q7 ++ vadd.i64 q5,q5,q3 ++ vshl.i64 q3,q3,#25 ++ vadd.i64 q6,q5,q11 ++ vadd.i64 q0,q0,q13 ++ vshl.i64 q7,q13,#25 ++ vadd.i64 q8,q0,q11 ++ vsub.i64 q3,q12,q3 ++ vshr.s64 q6,q6,#26 ++ vsub.i64 q7,q10,q7 ++ vtrn.32 d2,d6 ++ vshr.s64 q8,q8,#26 ++ vtrn.32 d3,d7 ++ vadd.i64 q3,q9,q6 ++ vst1.8 d2,[r2,: 64] ++ vshl.i64 q6,q6,#26 ++ vst1.8 d3,[r4,: 64] ++ vadd.i64 q1,q4,q8 ++ vtrn.32 d4,d14 ++ vshl.i64 q4,q8,#26 ++ vtrn.32 d5,d15 ++ vsub.i64 q5,q5,q6 ++ add r2,r2,#16 ++ vsub.i64 q0,q0,q4 ++ vst1.8 d4,[r2,: 64] ++ add r4,r4,#16 ++ vst1.8 d5,[r4,: 64] ++ vtrn.32 d10,d6 ++ vtrn.32 d11,d7 ++ sub r2,r2,#8 ++ sub r4,r4,#8 ++ vtrn.32 d0,d2 ++ vtrn.32 d1,d3 ++ vst1.8 d10,[r2,: 64] ++ vst1.8 d11,[r4,: 64] ++ sub r2,r2,#24 ++ sub r4,r4,#24 ++ vst1.8 d0,[r2,: 64] ++ vst1.8 d1,[r4,: 64] ++ add r2,r3,#288 ++ add r4,r3,#336 ++ vld1.8 {d0-d1},[r2,: 128]! ++ vld1.8 {d2-d3},[r4,: 128]! ++ vsub.i32 q0,q0,q1 ++ vld1.8 {d2-d3},[r2,: 128]! ++ vld1.8 {d4-d5},[r4,: 128]! ++ vsub.i32 q1,q1,q2 ++ add r5,r3,#240 ++ vld1.8 {d4},[r2,: 64] ++ vld1.8 {d6},[r4,: 64] ++ vsub.i32 q2,q2,q3 ++ vst1.8 {d0-d1},[r5,: 128]! ++ vst1.8 {d2-d3},[r5,: 128]! ++ vst1.8 d4,[r5,: 64] ++ add r2,r3,#144 ++ add r4,r3,#96 ++ add r5,r3,#144 ++ add r6,r3,#192 ++ vld1.8 {d0-d1},[r2,: 128]! ++ vld1.8 {d2-d3},[r4,: 128]! ++ vsub.i32 q2,q0,q1 ++ vadd.i32 q0,q0,q1 ++ vld1.8 {d2-d3},[r2,: 128]! ++ vld1.8 {d6-d7},[r4,: 128]! ++ vsub.i32 q4,q1,q3 ++ vadd.i32 q1,q1,q3 ++ vld1.8 {d6},[r2,: 64] ++ vld1.8 {d10},[r4,: 64] ++ vsub.i32 q6,q3,q5 ++ vadd.i32 q3,q3,q5 ++ vst1.8 {d4-d5},[r5,: 128]! ++ vst1.8 {d0-d1},[r6,: 128]! ++ vst1.8 {d8-d9},[r5,: 128]! ++ vst1.8 {d2-d3},[r6,: 128]! ++ vst1.8 d12,[r5,: 64] ++ vst1.8 d6,[r6,: 64] ++ add r2,r3,#0 ++ add r4,r3,#240 ++ vld1.8 {d0-d1},[r4,: 128]! ++ vld1.8 {d2-d3},[r4,: 128]! ++ vld1.8 {d4},[r4,: 64] ++ add r4,r3,#336 ++ vld1.8 {d6-d7},[r4,: 128]! ++ vtrn.32 q0,q3 ++ vld1.8 {d8-d9},[r4,: 128]! ++ vshl.i32 q5,q0,#4 ++ vtrn.32 q1,q4 ++ vshl.i32 q6,q3,#4 ++ vadd.i32 q5,q5,q0 ++ vadd.i32 q6,q6,q3 ++ vshl.i32 q7,q1,#4 ++ vld1.8 {d5},[r4,: 64] ++ vshl.i32 q8,q4,#4 ++ vtrn.32 d4,d5 ++ vadd.i32 q7,q7,q1 ++ vadd.i32 q8,q8,q4 ++ vld1.8 {d18-d19},[r2,: 128]! ++ vshl.i32 q10,q2,#4 ++ vld1.8 {d22-d23},[r2,: 128]! ++ vadd.i32 q10,q10,q2 ++ vld1.8 {d24},[r2,: 64] ++ vadd.i32 q5,q5,q0 ++ add r2,r3,#288 ++ vld1.8 {d26-d27},[r2,: 128]! ++ vadd.i32 q6,q6,q3 ++ vld1.8 {d28-d29},[r2,: 128]! 
++ vadd.i32 q8,q8,q4 ++ vld1.8 {d25},[r2,: 64] ++ vadd.i32 q10,q10,q2 ++ vtrn.32 q9,q13 ++ vadd.i32 q7,q7,q1 ++ vadd.i32 q5,q5,q0 ++ vtrn.32 q11,q14 ++ vadd.i32 q6,q6,q3 ++ add r2,sp,#560 ++ vadd.i32 q10,q10,q2 ++ vtrn.32 d24,d25 ++ vst1.8 {d12-d13},[r2,: 128] ++ vshl.i32 q6,q13,#1 ++ add r2,sp,#576 ++ vst1.8 {d20-d21},[r2,: 128] ++ vshl.i32 q10,q14,#1 ++ add r2,sp,#592 ++ vst1.8 {d12-d13},[r2,: 128] ++ vshl.i32 q15,q12,#1 ++ vadd.i32 q8,q8,q4 ++ vext.32 d10,d31,d30,#0 ++ vadd.i32 q7,q7,q1 ++ add r2,sp,#608 ++ vst1.8 {d16-d17},[r2,: 128] ++ vmull.s32 q8,d18,d5 ++ vmlal.s32 q8,d26,d4 ++ vmlal.s32 q8,d19,d9 ++ vmlal.s32 q8,d27,d3 ++ vmlal.s32 q8,d22,d8 ++ vmlal.s32 q8,d28,d2 ++ vmlal.s32 q8,d23,d7 ++ vmlal.s32 q8,d29,d1 ++ vmlal.s32 q8,d24,d6 ++ vmlal.s32 q8,d25,d0 ++ add r2,sp,#624 ++ vst1.8 {d14-d15},[r2,: 128] ++ vmull.s32 q2,d18,d4 ++ vmlal.s32 q2,d12,d9 ++ vmlal.s32 q2,d13,d8 ++ vmlal.s32 q2,d19,d3 ++ vmlal.s32 q2,d22,d2 ++ vmlal.s32 q2,d23,d1 ++ vmlal.s32 q2,d24,d0 ++ add r2,sp,#640 ++ vst1.8 {d20-d21},[r2,: 128] ++ vmull.s32 q7,d18,d9 ++ vmlal.s32 q7,d26,d3 ++ vmlal.s32 q7,d19,d8 ++ vmlal.s32 q7,d27,d2 ++ vmlal.s32 q7,d22,d7 ++ vmlal.s32 q7,d28,d1 ++ vmlal.s32 q7,d23,d6 ++ vmlal.s32 q7,d29,d0 ++ add r2,sp,#656 ++ vst1.8 {d10-d11},[r2,: 128] ++ vmull.s32 q5,d18,d3 ++ vmlal.s32 q5,d19,d2 ++ vmlal.s32 q5,d22,d1 ++ vmlal.s32 q5,d23,d0 ++ vmlal.s32 q5,d12,d8 ++ add r2,sp,#672 ++ vst1.8 {d16-d17},[r2,: 128] ++ vmull.s32 q4,d18,d8 ++ vmlal.s32 q4,d26,d2 ++ vmlal.s32 q4,d19,d7 ++ vmlal.s32 q4,d27,d1 ++ vmlal.s32 q4,d22,d6 ++ vmlal.s32 q4,d28,d0 ++ vmull.s32 q8,d18,d7 ++ vmlal.s32 q8,d26,d1 ++ vmlal.s32 q8,d19,d6 ++ vmlal.s32 q8,d27,d0 ++ add r2,sp,#576 ++ vld1.8 {d20-d21},[r2,: 128] ++ vmlal.s32 q7,d24,d21 ++ vmlal.s32 q7,d25,d20 ++ vmlal.s32 q4,d23,d21 ++ vmlal.s32 q4,d29,d20 ++ vmlal.s32 q8,d22,d21 ++ vmlal.s32 q8,d28,d20 ++ vmlal.s32 q5,d24,d20 ++ add r2,sp,#576 ++ vst1.8 {d14-d15},[r2,: 128] ++ vmull.s32 q7,d18,d6 ++ vmlal.s32 q7,d26,d0 ++ add r2,sp,#656 ++ vld1.8 {d30-d31},[r2,: 128] ++ vmlal.s32 q2,d30,d21 ++ vmlal.s32 q7,d19,d21 ++ vmlal.s32 q7,d27,d20 ++ add r2,sp,#624 ++ vld1.8 {d26-d27},[r2,: 128] ++ vmlal.s32 q4,d25,d27 ++ vmlal.s32 q8,d29,d27 ++ vmlal.s32 q8,d25,d26 ++ vmlal.s32 q7,d28,d27 ++ vmlal.s32 q7,d29,d26 ++ add r2,sp,#608 ++ vld1.8 {d28-d29},[r2,: 128] ++ vmlal.s32 q4,d24,d29 ++ vmlal.s32 q8,d23,d29 ++ vmlal.s32 q8,d24,d28 ++ vmlal.s32 q7,d22,d29 ++ vmlal.s32 q7,d23,d28 ++ add r2,sp,#608 ++ vst1.8 {d8-d9},[r2,: 128] ++ add r2,sp,#560 ++ vld1.8 {d8-d9},[r2,: 128] ++ vmlal.s32 q7,d24,d9 ++ vmlal.s32 q7,d25,d31 ++ vmull.s32 q1,d18,d2 ++ vmlal.s32 q1,d19,d1 ++ vmlal.s32 q1,d22,d0 ++ vmlal.s32 q1,d24,d27 ++ vmlal.s32 q1,d23,d20 ++ vmlal.s32 q1,d12,d7 ++ vmlal.s32 q1,d13,d6 ++ vmull.s32 q6,d18,d1 ++ vmlal.s32 q6,d19,d0 ++ vmlal.s32 q6,d23,d27 ++ vmlal.s32 q6,d22,d20 ++ vmlal.s32 q6,d24,d26 ++ vmull.s32 q0,d18,d0 ++ vmlal.s32 q0,d22,d27 ++ vmlal.s32 q0,d23,d26 ++ vmlal.s32 q0,d24,d31 ++ vmlal.s32 q0,d19,d20 ++ add r2,sp,#640 ++ vld1.8 {d18-d19},[r2,: 128] ++ vmlal.s32 q2,d18,d7 ++ vmlal.s32 q2,d19,d6 ++ vmlal.s32 q5,d18,d6 ++ vmlal.s32 q5,d19,d21 ++ vmlal.s32 q1,d18,d21 ++ vmlal.s32 q1,d19,d29 ++ vmlal.s32 q0,d18,d28 ++ vmlal.s32 q0,d19,d9 ++ vmlal.s32 q6,d18,d29 ++ vmlal.s32 q6,d19,d28 ++ add r2,sp,#592 ++ vld1.8 {d18-d19},[r2,: 128] ++ add r2,sp,#512 ++ vld1.8 {d22-d23},[r2,: 128] ++ vmlal.s32 q5,d19,d7 ++ vmlal.s32 q0,d18,d21 ++ vmlal.s32 q0,d19,d29 ++ vmlal.s32 q6,d18,d6 ++ add r2,sp,#528 ++ vld1.8 {d6-d7},[r2,: 128] ++ vmlal.s32 q6,d19,d21 ++ add r2,sp,#576 ++ vld1.8 
{d18-d19},[r2,: 128] ++ vmlal.s32 q0,d30,d8 ++ add r2,sp,#672 ++ vld1.8 {d20-d21},[r2,: 128] ++ vmlal.s32 q5,d30,d29 ++ add r2,sp,#608 ++ vld1.8 {d24-d25},[r2,: 128] ++ vmlal.s32 q1,d30,d28 ++ vadd.i64 q13,q0,q11 ++ vadd.i64 q14,q5,q11 ++ vmlal.s32 q6,d30,d9 ++ vshr.s64 q4,q13,#26 ++ vshr.s64 q13,q14,#26 ++ vadd.i64 q7,q7,q4 ++ vshl.i64 q4,q4,#26 ++ vadd.i64 q14,q7,q3 ++ vadd.i64 q9,q9,q13 ++ vshl.i64 q13,q13,#26 ++ vadd.i64 q15,q9,q3 ++ vsub.i64 q0,q0,q4 ++ vshr.s64 q4,q14,#25 ++ vsub.i64 q5,q5,q13 ++ vshr.s64 q13,q15,#25 ++ vadd.i64 q6,q6,q4 ++ vshl.i64 q4,q4,#25 ++ vadd.i64 q14,q6,q11 ++ vadd.i64 q2,q2,q13 ++ vsub.i64 q4,q7,q4 ++ vshr.s64 q7,q14,#26 ++ vshl.i64 q13,q13,#25 ++ vadd.i64 q14,q2,q11 ++ vadd.i64 q8,q8,q7 ++ vshl.i64 q7,q7,#26 ++ vadd.i64 q15,q8,q3 ++ vsub.i64 q9,q9,q13 ++ vshr.s64 q13,q14,#26 ++ vsub.i64 q6,q6,q7 ++ vshr.s64 q7,q15,#25 ++ vadd.i64 q10,q10,q13 ++ vshl.i64 q13,q13,#26 ++ vadd.i64 q14,q10,q3 ++ vadd.i64 q1,q1,q7 ++ add r2,r3,#288 ++ vshl.i64 q7,q7,#25 ++ add r4,r3,#96 ++ vadd.i64 q15,q1,q11 ++ add r2,r2,#8 ++ vsub.i64 q2,q2,q13 ++ add r4,r4,#8 ++ vshr.s64 q13,q14,#25 ++ vsub.i64 q7,q8,q7 ++ vshr.s64 q8,q15,#26 ++ vadd.i64 q14,q13,q13 ++ vadd.i64 q12,q12,q8 ++ vtrn.32 d12,d14 ++ vshl.i64 q8,q8,#26 ++ vtrn.32 d13,d15 ++ vadd.i64 q3,q12,q3 ++ vadd.i64 q0,q0,q14 ++ vst1.8 d12,[r2,: 64]! ++ vshl.i64 q7,q13,#4 ++ vst1.8 d13,[r4,: 64]! ++ vsub.i64 q1,q1,q8 ++ vshr.s64 q3,q3,#25 ++ vadd.i64 q0,q0,q7 ++ vadd.i64 q5,q5,q3 ++ vshl.i64 q3,q3,#25 ++ vadd.i64 q6,q5,q11 ++ vadd.i64 q0,q0,q13 ++ vshl.i64 q7,q13,#25 ++ vadd.i64 q8,q0,q11 ++ vsub.i64 q3,q12,q3 ++ vshr.s64 q6,q6,#26 ++ vsub.i64 q7,q10,q7 ++ vtrn.32 d2,d6 ++ vshr.s64 q8,q8,#26 ++ vtrn.32 d3,d7 ++ vadd.i64 q3,q9,q6 ++ vst1.8 d2,[r2,: 64] ++ vshl.i64 q6,q6,#26 ++ vst1.8 d3,[r4,: 64] ++ vadd.i64 q1,q4,q8 ++ vtrn.32 d4,d14 ++ vshl.i64 q4,q8,#26 ++ vtrn.32 d5,d15 ++ vsub.i64 q5,q5,q6 ++ add r2,r2,#16 ++ vsub.i64 q0,q0,q4 ++ vst1.8 d4,[r2,: 64] ++ add r4,r4,#16 ++ vst1.8 d5,[r4,: 64] ++ vtrn.32 d10,d6 ++ vtrn.32 d11,d7 ++ sub r2,r2,#8 ++ sub r4,r4,#8 ++ vtrn.32 d0,d2 ++ vtrn.32 d1,d3 ++ vst1.8 d10,[r2,: 64] ++ vst1.8 d11,[r4,: 64] ++ sub r2,r2,#24 ++ sub r4,r4,#24 ++ vst1.8 d0,[r2,: 64] ++ vst1.8 d1,[r4,: 64] ++ add r2,sp,#544 ++ add r4,r3,#144 ++ add r5,r3,#192 ++ vld1.8 {d0-d1},[r2,: 128] ++ vld1.8 {d2-d3},[r4,: 128]! ++ vld1.8 {d4-d5},[r5,: 128]! ++ vzip.i32 q1,q2 ++ vld1.8 {d6-d7},[r4,: 128]! ++ vld1.8 {d8-d9},[r5,: 128]! 
++ vshl.i32 q5,q1,#1 ++ vzip.i32 q3,q4 ++ vshl.i32 q6,q2,#1 ++ vld1.8 {d14},[r4,: 64] ++ vshl.i32 q8,q3,#1 ++ vld1.8 {d15},[r5,: 64] ++ vshl.i32 q9,q4,#1 ++ vmul.i32 d21,d7,d1 ++ vtrn.32 d14,d15 ++ vmul.i32 q11,q4,q0 ++ vmul.i32 q0,q7,q0 ++ vmull.s32 q12,d2,d2 ++ vmlal.s32 q12,d11,d1 ++ vmlal.s32 q12,d12,d0 ++ vmlal.s32 q12,d13,d23 ++ vmlal.s32 q12,d16,d22 ++ vmlal.s32 q12,d7,d21 ++ vmull.s32 q10,d2,d11 ++ vmlal.s32 q10,d4,d1 ++ vmlal.s32 q10,d13,d0 ++ vmlal.s32 q10,d6,d23 ++ vmlal.s32 q10,d17,d22 ++ vmull.s32 q13,d10,d4 ++ vmlal.s32 q13,d11,d3 ++ vmlal.s32 q13,d13,d1 ++ vmlal.s32 q13,d16,d0 ++ vmlal.s32 q13,d17,d23 ++ vmlal.s32 q13,d8,d22 ++ vmull.s32 q1,d10,d5 ++ vmlal.s32 q1,d11,d4 ++ vmlal.s32 q1,d6,d1 ++ vmlal.s32 q1,d17,d0 ++ vmlal.s32 q1,d8,d23 ++ vmull.s32 q14,d10,d6 ++ vmlal.s32 q14,d11,d13 ++ vmlal.s32 q14,d4,d4 ++ vmlal.s32 q14,d17,d1 ++ vmlal.s32 q14,d18,d0 ++ vmlal.s32 q14,d9,d23 ++ vmull.s32 q11,d10,d7 ++ vmlal.s32 q11,d11,d6 ++ vmlal.s32 q11,d12,d5 ++ vmlal.s32 q11,d8,d1 ++ vmlal.s32 q11,d19,d0 ++ vmull.s32 q15,d10,d8 ++ vmlal.s32 q15,d11,d17 ++ vmlal.s32 q15,d12,d6 ++ vmlal.s32 q15,d13,d5 ++ vmlal.s32 q15,d19,d1 ++ vmlal.s32 q15,d14,d0 ++ vmull.s32 q2,d10,d9 ++ vmlal.s32 q2,d11,d8 ++ vmlal.s32 q2,d12,d7 ++ vmlal.s32 q2,d13,d6 ++ vmlal.s32 q2,d14,d1 ++ vmull.s32 q0,d15,d1 ++ vmlal.s32 q0,d10,d14 ++ vmlal.s32 q0,d11,d19 ++ vmlal.s32 q0,d12,d8 ++ vmlal.s32 q0,d13,d17 ++ vmlal.s32 q0,d6,d6 ++ add r2,sp,#512 ++ vld1.8 {d18-d19},[r2,: 128] ++ vmull.s32 q3,d16,d7 ++ vmlal.s32 q3,d10,d15 ++ vmlal.s32 q3,d11,d14 ++ vmlal.s32 q3,d12,d9 ++ vmlal.s32 q3,d13,d8 ++ add r2,sp,#528 ++ vld1.8 {d8-d9},[r2,: 128] ++ vadd.i64 q5,q12,q9 ++ vadd.i64 q6,q15,q9 ++ vshr.s64 q5,q5,#26 ++ vshr.s64 q6,q6,#26 ++ vadd.i64 q7,q10,q5 ++ vshl.i64 q5,q5,#26 ++ vadd.i64 q8,q7,q4 ++ vadd.i64 q2,q2,q6 ++ vshl.i64 q6,q6,#26 ++ vadd.i64 q10,q2,q4 ++ vsub.i64 q5,q12,q5 ++ vshr.s64 q8,q8,#25 ++ vsub.i64 q6,q15,q6 ++ vshr.s64 q10,q10,#25 ++ vadd.i64 q12,q13,q8 ++ vshl.i64 q8,q8,#25 ++ vadd.i64 q13,q12,q9 ++ vadd.i64 q0,q0,q10 ++ vsub.i64 q7,q7,q8 ++ vshr.s64 q8,q13,#26 ++ vshl.i64 q10,q10,#25 ++ vadd.i64 q13,q0,q9 ++ vadd.i64 q1,q1,q8 ++ vshl.i64 q8,q8,#26 ++ vadd.i64 q15,q1,q4 ++ vsub.i64 q2,q2,q10 ++ vshr.s64 q10,q13,#26 ++ vsub.i64 q8,q12,q8 ++ vshr.s64 q12,q15,#25 ++ vadd.i64 q3,q3,q10 ++ vshl.i64 q10,q10,#26 ++ vadd.i64 q13,q3,q4 ++ vadd.i64 q14,q14,q12 ++ add r2,r3,#144 ++ vshl.i64 q12,q12,#25 ++ add r4,r3,#192 ++ vadd.i64 q15,q14,q9 ++ add r2,r2,#8 ++ vsub.i64 q0,q0,q10 ++ add r4,r4,#8 ++ vshr.s64 q10,q13,#25 ++ vsub.i64 q1,q1,q12 ++ vshr.s64 q12,q15,#26 ++ vadd.i64 q13,q10,q10 ++ vadd.i64 q11,q11,q12 ++ vtrn.32 d16,d2 ++ vshl.i64 q12,q12,#26 ++ vtrn.32 d17,d3 ++ vadd.i64 q1,q11,q4 ++ vadd.i64 q4,q5,q13 ++ vst1.8 d16,[r2,: 64]! ++ vshl.i64 q5,q10,#4 ++ vst1.8 d17,[r4,: 64]! 
++ vsub.i64 q8,q14,q12 ++ vshr.s64 q1,q1,#25 ++ vadd.i64 q4,q4,q5 ++ vadd.i64 q5,q6,q1 ++ vshl.i64 q1,q1,#25 ++ vadd.i64 q6,q5,q9 ++ vadd.i64 q4,q4,q10 ++ vshl.i64 q10,q10,#25 ++ vadd.i64 q9,q4,q9 ++ vsub.i64 q1,q11,q1 ++ vshr.s64 q6,q6,#26 ++ vsub.i64 q3,q3,q10 ++ vtrn.32 d16,d2 ++ vshr.s64 q9,q9,#26 ++ vtrn.32 d17,d3 ++ vadd.i64 q1,q2,q6 ++ vst1.8 d16,[r2,: 64] ++ vshl.i64 q2,q6,#26 ++ vst1.8 d17,[r4,: 64] ++ vadd.i64 q6,q7,q9 ++ vtrn.32 d0,d6 ++ vshl.i64 q7,q9,#26 ++ vtrn.32 d1,d7 ++ vsub.i64 q2,q5,q2 ++ add r2,r2,#16 ++ vsub.i64 q3,q4,q7 ++ vst1.8 d0,[r2,: 64] ++ add r4,r4,#16 ++ vst1.8 d1,[r4,: 64] ++ vtrn.32 d4,d2 ++ vtrn.32 d5,d3 ++ sub r2,r2,#8 ++ sub r4,r4,#8 ++ vtrn.32 d6,d12 ++ vtrn.32 d7,d13 ++ vst1.8 d4,[r2,: 64] ++ vst1.8 d5,[r4,: 64] ++ sub r2,r2,#24 ++ sub r4,r4,#24 ++ vst1.8 d6,[r2,: 64] ++ vst1.8 d7,[r4,: 64] ++ add r2,r3,#336 ++ add r4,r3,#288 ++ vld1.8 {d0-d1},[r2,: 128]! ++ vld1.8 {d2-d3},[r4,: 128]! ++ vadd.i32 q0,q0,q1 ++ vld1.8 {d2-d3},[r2,: 128]! ++ vld1.8 {d4-d5},[r4,: 128]! ++ vadd.i32 q1,q1,q2 ++ add r5,r3,#288 ++ vld1.8 {d4},[r2,: 64] ++ vld1.8 {d6},[r4,: 64] ++ vadd.i32 q2,q2,q3 ++ vst1.8 {d0-d1},[r5,: 128]! ++ vst1.8 {d2-d3},[r5,: 128]! ++ vst1.8 d4,[r5,: 64] ++ add r2,r3,#48 ++ add r4,r3,#144 ++ vld1.8 {d0-d1},[r4,: 128]! ++ vld1.8 {d2-d3},[r4,: 128]! ++ vld1.8 {d4},[r4,: 64] ++ add r4,r3,#288 ++ vld1.8 {d6-d7},[r4,: 128]! ++ vtrn.32 q0,q3 ++ vld1.8 {d8-d9},[r4,: 128]! ++ vshl.i32 q5,q0,#4 ++ vtrn.32 q1,q4 ++ vshl.i32 q6,q3,#4 ++ vadd.i32 q5,q5,q0 ++ vadd.i32 q6,q6,q3 ++ vshl.i32 q7,q1,#4 ++ vld1.8 {d5},[r4,: 64] ++ vshl.i32 q8,q4,#4 ++ vtrn.32 d4,d5 ++ vadd.i32 q7,q7,q1 ++ vadd.i32 q8,q8,q4 ++ vld1.8 {d18-d19},[r2,: 128]! ++ vshl.i32 q10,q2,#4 ++ vld1.8 {d22-d23},[r2,: 128]! ++ vadd.i32 q10,q10,q2 ++ vld1.8 {d24},[r2,: 64] ++ vadd.i32 q5,q5,q0 ++ add r2,r3,#240 ++ vld1.8 {d26-d27},[r2,: 128]! ++ vadd.i32 q6,q6,q3 ++ vld1.8 {d28-d29},[r2,: 128]! 
++ vadd.i32 q8,q8,q4 ++ vld1.8 {d25},[r2,: 64] ++ vadd.i32 q10,q10,q2 ++ vtrn.32 q9,q13 ++ vadd.i32 q7,q7,q1 ++ vadd.i32 q5,q5,q0 ++ vtrn.32 q11,q14 ++ vadd.i32 q6,q6,q3 ++ add r2,sp,#560 ++ vadd.i32 q10,q10,q2 ++ vtrn.32 d24,d25 ++ vst1.8 {d12-d13},[r2,: 128] ++ vshl.i32 q6,q13,#1 ++ add r2,sp,#576 ++ vst1.8 {d20-d21},[r2,: 128] ++ vshl.i32 q10,q14,#1 ++ add r2,sp,#592 ++ vst1.8 {d12-d13},[r2,: 128] ++ vshl.i32 q15,q12,#1 ++ vadd.i32 q8,q8,q4 ++ vext.32 d10,d31,d30,#0 ++ vadd.i32 q7,q7,q1 ++ add r2,sp,#608 ++ vst1.8 {d16-d17},[r2,: 128] ++ vmull.s32 q8,d18,d5 ++ vmlal.s32 q8,d26,d4 ++ vmlal.s32 q8,d19,d9 ++ vmlal.s32 q8,d27,d3 ++ vmlal.s32 q8,d22,d8 ++ vmlal.s32 q8,d28,d2 ++ vmlal.s32 q8,d23,d7 ++ vmlal.s32 q8,d29,d1 ++ vmlal.s32 q8,d24,d6 ++ vmlal.s32 q8,d25,d0 ++ add r2,sp,#624 ++ vst1.8 {d14-d15},[r2,: 128] ++ vmull.s32 q2,d18,d4 ++ vmlal.s32 q2,d12,d9 ++ vmlal.s32 q2,d13,d8 ++ vmlal.s32 q2,d19,d3 ++ vmlal.s32 q2,d22,d2 ++ vmlal.s32 q2,d23,d1 ++ vmlal.s32 q2,d24,d0 ++ add r2,sp,#640 ++ vst1.8 {d20-d21},[r2,: 128] ++ vmull.s32 q7,d18,d9 ++ vmlal.s32 q7,d26,d3 ++ vmlal.s32 q7,d19,d8 ++ vmlal.s32 q7,d27,d2 ++ vmlal.s32 q7,d22,d7 ++ vmlal.s32 q7,d28,d1 ++ vmlal.s32 q7,d23,d6 ++ vmlal.s32 q7,d29,d0 ++ add r2,sp,#656 ++ vst1.8 {d10-d11},[r2,: 128] ++ vmull.s32 q5,d18,d3 ++ vmlal.s32 q5,d19,d2 ++ vmlal.s32 q5,d22,d1 ++ vmlal.s32 q5,d23,d0 ++ vmlal.s32 q5,d12,d8 ++ add r2,sp,#672 ++ vst1.8 {d16-d17},[r2,: 128] ++ vmull.s32 q4,d18,d8 ++ vmlal.s32 q4,d26,d2 ++ vmlal.s32 q4,d19,d7 ++ vmlal.s32 q4,d27,d1 ++ vmlal.s32 q4,d22,d6 ++ vmlal.s32 q4,d28,d0 ++ vmull.s32 q8,d18,d7 ++ vmlal.s32 q8,d26,d1 ++ vmlal.s32 q8,d19,d6 ++ vmlal.s32 q8,d27,d0 ++ add r2,sp,#576 ++ vld1.8 {d20-d21},[r2,: 128] ++ vmlal.s32 q7,d24,d21 ++ vmlal.s32 q7,d25,d20 ++ vmlal.s32 q4,d23,d21 ++ vmlal.s32 q4,d29,d20 ++ vmlal.s32 q8,d22,d21 ++ vmlal.s32 q8,d28,d20 ++ vmlal.s32 q5,d24,d20 ++ add r2,sp,#576 ++ vst1.8 {d14-d15},[r2,: 128] ++ vmull.s32 q7,d18,d6 ++ vmlal.s32 q7,d26,d0 ++ add r2,sp,#656 ++ vld1.8 {d30-d31},[r2,: 128] ++ vmlal.s32 q2,d30,d21 ++ vmlal.s32 q7,d19,d21 ++ vmlal.s32 q7,d27,d20 ++ add r2,sp,#624 ++ vld1.8 {d26-d27},[r2,: 128] ++ vmlal.s32 q4,d25,d27 ++ vmlal.s32 q8,d29,d27 ++ vmlal.s32 q8,d25,d26 ++ vmlal.s32 q7,d28,d27 ++ vmlal.s32 q7,d29,d26 ++ add r2,sp,#608 ++ vld1.8 {d28-d29},[r2,: 128] ++ vmlal.s32 q4,d24,d29 ++ vmlal.s32 q8,d23,d29 ++ vmlal.s32 q8,d24,d28 ++ vmlal.s32 q7,d22,d29 ++ vmlal.s32 q7,d23,d28 ++ add r2,sp,#608 ++ vst1.8 {d8-d9},[r2,: 128] ++ add r2,sp,#560 ++ vld1.8 {d8-d9},[r2,: 128] ++ vmlal.s32 q7,d24,d9 ++ vmlal.s32 q7,d25,d31 ++ vmull.s32 q1,d18,d2 ++ vmlal.s32 q1,d19,d1 ++ vmlal.s32 q1,d22,d0 ++ vmlal.s32 q1,d24,d27 ++ vmlal.s32 q1,d23,d20 ++ vmlal.s32 q1,d12,d7 ++ vmlal.s32 q1,d13,d6 ++ vmull.s32 q6,d18,d1 ++ vmlal.s32 q6,d19,d0 ++ vmlal.s32 q6,d23,d27 ++ vmlal.s32 q6,d22,d20 ++ vmlal.s32 q6,d24,d26 ++ vmull.s32 q0,d18,d0 ++ vmlal.s32 q0,d22,d27 ++ vmlal.s32 q0,d23,d26 ++ vmlal.s32 q0,d24,d31 ++ vmlal.s32 q0,d19,d20 ++ add r2,sp,#640 ++ vld1.8 {d18-d19},[r2,: 128] ++ vmlal.s32 q2,d18,d7 ++ vmlal.s32 q2,d19,d6 ++ vmlal.s32 q5,d18,d6 ++ vmlal.s32 q5,d19,d21 ++ vmlal.s32 q1,d18,d21 ++ vmlal.s32 q1,d19,d29 ++ vmlal.s32 q0,d18,d28 ++ vmlal.s32 q0,d19,d9 ++ vmlal.s32 q6,d18,d29 ++ vmlal.s32 q6,d19,d28 ++ add r2,sp,#592 ++ vld1.8 {d18-d19},[r2,: 128] ++ add r2,sp,#512 ++ vld1.8 {d22-d23},[r2,: 128] ++ vmlal.s32 q5,d19,d7 ++ vmlal.s32 q0,d18,d21 ++ vmlal.s32 q0,d19,d29 ++ vmlal.s32 q6,d18,d6 ++ add r2,sp,#528 ++ vld1.8 {d6-d7},[r2,: 128] ++ vmlal.s32 q6,d19,d21 ++ add r2,sp,#576 ++ vld1.8 
{d18-d19},[r2,: 128] ++ vmlal.s32 q0,d30,d8 ++ add r2,sp,#672 ++ vld1.8 {d20-d21},[r2,: 128] ++ vmlal.s32 q5,d30,d29 ++ add r2,sp,#608 ++ vld1.8 {d24-d25},[r2,: 128] ++ vmlal.s32 q1,d30,d28 ++ vadd.i64 q13,q0,q11 ++ vadd.i64 q14,q5,q11 ++ vmlal.s32 q6,d30,d9 ++ vshr.s64 q4,q13,#26 ++ vshr.s64 q13,q14,#26 ++ vadd.i64 q7,q7,q4 ++ vshl.i64 q4,q4,#26 ++ vadd.i64 q14,q7,q3 ++ vadd.i64 q9,q9,q13 ++ vshl.i64 q13,q13,#26 ++ vadd.i64 q15,q9,q3 ++ vsub.i64 q0,q0,q4 ++ vshr.s64 q4,q14,#25 ++ vsub.i64 q5,q5,q13 ++ vshr.s64 q13,q15,#25 ++ vadd.i64 q6,q6,q4 ++ vshl.i64 q4,q4,#25 ++ vadd.i64 q14,q6,q11 ++ vadd.i64 q2,q2,q13 ++ vsub.i64 q4,q7,q4 ++ vshr.s64 q7,q14,#26 ++ vshl.i64 q13,q13,#25 ++ vadd.i64 q14,q2,q11 ++ vadd.i64 q8,q8,q7 ++ vshl.i64 q7,q7,#26 ++ vadd.i64 q15,q8,q3 ++ vsub.i64 q9,q9,q13 ++ vshr.s64 q13,q14,#26 ++ vsub.i64 q6,q6,q7 ++ vshr.s64 q7,q15,#25 ++ vadd.i64 q10,q10,q13 ++ vshl.i64 q13,q13,#26 ++ vadd.i64 q14,q10,q3 ++ vadd.i64 q1,q1,q7 ++ add r2,r3,#240 ++ vshl.i64 q7,q7,#25 ++ add r4,r3,#144 ++ vadd.i64 q15,q1,q11 ++ add r2,r2,#8 ++ vsub.i64 q2,q2,q13 ++ add r4,r4,#8 ++ vshr.s64 q13,q14,#25 ++ vsub.i64 q7,q8,q7 ++ vshr.s64 q8,q15,#26 ++ vadd.i64 q14,q13,q13 ++ vadd.i64 q12,q12,q8 ++ vtrn.32 d12,d14 ++ vshl.i64 q8,q8,#26 ++ vtrn.32 d13,d15 ++ vadd.i64 q3,q12,q3 ++ vadd.i64 q0,q0,q14 ++ vst1.8 d12,[r2,: 64]! ++ vshl.i64 q7,q13,#4 ++ vst1.8 d13,[r4,: 64]! ++ vsub.i64 q1,q1,q8 ++ vshr.s64 q3,q3,#25 ++ vadd.i64 q0,q0,q7 ++ vadd.i64 q5,q5,q3 ++ vshl.i64 q3,q3,#25 ++ vadd.i64 q6,q5,q11 ++ vadd.i64 q0,q0,q13 ++ vshl.i64 q7,q13,#25 ++ vadd.i64 q8,q0,q11 ++ vsub.i64 q3,q12,q3 ++ vshr.s64 q6,q6,#26 ++ vsub.i64 q7,q10,q7 ++ vtrn.32 d2,d6 ++ vshr.s64 q8,q8,#26 ++ vtrn.32 d3,d7 ++ vadd.i64 q3,q9,q6 ++ vst1.8 d2,[r2,: 64] ++ vshl.i64 q6,q6,#26 ++ vst1.8 d3,[r4,: 64] ++ vadd.i64 q1,q4,q8 ++ vtrn.32 d4,d14 ++ vshl.i64 q4,q8,#26 ++ vtrn.32 d5,d15 ++ vsub.i64 q5,q5,q6 ++ add r2,r2,#16 ++ vsub.i64 q0,q0,q4 ++ vst1.8 d4,[r2,: 64] ++ add r4,r4,#16 ++ vst1.8 d5,[r4,: 64] ++ vtrn.32 d10,d6 ++ vtrn.32 d11,d7 ++ sub r2,r2,#8 ++ sub r4,r4,#8 ++ vtrn.32 d0,d2 ++ vtrn.32 d1,d3 ++ vst1.8 d10,[r2,: 64] ++ vst1.8 d11,[r4,: 64] ++ sub r2,r2,#24 ++ sub r4,r4,#24 ++ vst1.8 d0,[r2,: 64] ++ vst1.8 d1,[r4,: 64] ++ ldr r2,[sp,#488] ++ ldr r4,[sp,#492] ++ subs r5,r2,#1 ++ bge .Lmainloop ++ add r1,r3,#144 ++ add r2,r3,#336 ++ vld1.8 {d0-d1},[r1,: 128]! ++ vld1.8 {d2-d3},[r1,: 128]! ++ vld1.8 {d4},[r1,: 64] ++ vst1.8 {d0-d1},[r2,: 128]! ++ vst1.8 {d2-d3},[r2,: 128]! ++ vst1.8 d4,[r2,: 64] ++ ldr r1,=0 ++ .Linvertloop: ++ add r2,r3,#144 ++ ldr r4,=0 ++ ldr r5,=2 ++ cmp r1,#1 ++ ldreq r5,=1 ++ addeq r2,r3,#336 ++ addeq r4,r3,#48 ++ cmp r1,#2 ++ ldreq r5,=1 ++ addeq r2,r3,#48 ++ cmp r1,#3 ++ ldreq r5,=5 ++ addeq r4,r3,#336 ++ cmp r1,#4 ++ ldreq r5,=10 ++ cmp r1,#5 ++ ldreq r5,=20 ++ cmp r1,#6 ++ ldreq r5,=10 ++ addeq r2,r3,#336 ++ addeq r4,r3,#336 ++ cmp r1,#7 ++ ldreq r5,=50 ++ cmp r1,#8 ++ ldreq r5,=100 ++ cmp r1,#9 ++ ldreq r5,=50 ++ addeq r2,r3,#336 ++ cmp r1,#10 ++ ldreq r5,=5 ++ addeq r2,r3,#48 ++ cmp r1,#11 ++ ldreq r5,=0 ++ addeq r2,r3,#96 ++ add r6,r3,#144 ++ add r7,r3,#288 ++ vld1.8 {d0-d1},[r6,: 128]! ++ vld1.8 {d2-d3},[r6,: 128]! ++ vld1.8 {d4},[r6,: 64] ++ vst1.8 {d0-d1},[r7,: 128]! ++ vst1.8 {d2-d3},[r7,: 128]! ++ vst1.8 d4,[r7,: 64] ++ cmp r5,#0 ++ beq .Lskipsquaringloop ++ .Lsquaringloop: ++ add r6,r3,#288 ++ add r7,r3,#288 ++ add r8,r3,#288 ++ vmov.i32 q0,#19 ++ vmov.i32 q1,#0 ++ vmov.i32 q2,#1 ++ vzip.i32 q1,q2 ++ vld1.8 {d4-d5},[r7,: 128]! ++ vld1.8 {d6-d7},[r7,: 128]! 
++ vld1.8 {d9},[r7,: 64] ++ vld1.8 {d10-d11},[r6,: 128]! ++ add r7,sp,#416 ++ vld1.8 {d12-d13},[r6,: 128]! ++ vmul.i32 q7,q2,q0 ++ vld1.8 {d8},[r6,: 64] ++ vext.32 d17,d11,d10,#1 ++ vmul.i32 q9,q3,q0 ++ vext.32 d16,d10,d8,#1 ++ vshl.u32 q10,q5,q1 ++ vext.32 d22,d14,d4,#1 ++ vext.32 d24,d18,d6,#1 ++ vshl.u32 q13,q6,q1 ++ vshl.u32 d28,d8,d2 ++ vrev64.i32 d22,d22 ++ vmul.i32 d1,d9,d1 ++ vrev64.i32 d24,d24 ++ vext.32 d29,d8,d13,#1 ++ vext.32 d0,d1,d9,#1 ++ vrev64.i32 d0,d0 ++ vext.32 d2,d9,d1,#1 ++ vext.32 d23,d15,d5,#1 ++ vmull.s32 q4,d20,d4 ++ vrev64.i32 d23,d23 ++ vmlal.s32 q4,d21,d1 ++ vrev64.i32 d2,d2 ++ vmlal.s32 q4,d26,d19 ++ vext.32 d3,d5,d15,#1 ++ vmlal.s32 q4,d27,d18 ++ vrev64.i32 d3,d3 ++ vmlal.s32 q4,d28,d15 ++ vext.32 d14,d12,d11,#1 ++ vmull.s32 q5,d16,d23 ++ vext.32 d15,d13,d12,#1 ++ vmlal.s32 q5,d17,d4 ++ vst1.8 d8,[r7,: 64]! ++ vmlal.s32 q5,d14,d1 ++ vext.32 d12,d9,d8,#0 ++ vmlal.s32 q5,d15,d19 ++ vmov.i64 d13,#0 ++ vmlal.s32 q5,d29,d18 ++ vext.32 d25,d19,d7,#1 ++ vmlal.s32 q6,d20,d5 ++ vrev64.i32 d25,d25 ++ vmlal.s32 q6,d21,d4 ++ vst1.8 d11,[r7,: 64]! ++ vmlal.s32 q6,d26,d1 ++ vext.32 d9,d10,d10,#0 ++ vmlal.s32 q6,d27,d19 ++ vmov.i64 d8,#0 ++ vmlal.s32 q6,d28,d18 ++ vmlal.s32 q4,d16,d24 ++ vmlal.s32 q4,d17,d5 ++ vmlal.s32 q4,d14,d4 ++ vst1.8 d12,[r7,: 64]! ++ vmlal.s32 q4,d15,d1 ++ vext.32 d10,d13,d12,#0 ++ vmlal.s32 q4,d29,d19 ++ vmov.i64 d11,#0 ++ vmlal.s32 q5,d20,d6 ++ vmlal.s32 q5,d21,d5 ++ vmlal.s32 q5,d26,d4 ++ vext.32 d13,d8,d8,#0 ++ vmlal.s32 q5,d27,d1 ++ vmov.i64 d12,#0 ++ vmlal.s32 q5,d28,d19 ++ vst1.8 d9,[r7,: 64]! ++ vmlal.s32 q6,d16,d25 ++ vmlal.s32 q6,d17,d6 ++ vst1.8 d10,[r7,: 64] ++ vmlal.s32 q6,d14,d5 ++ vext.32 d8,d11,d10,#0 ++ vmlal.s32 q6,d15,d4 ++ vmov.i64 d9,#0 ++ vmlal.s32 q6,d29,d1 ++ vmlal.s32 q4,d20,d7 ++ vmlal.s32 q4,d21,d6 ++ vmlal.s32 q4,d26,d5 ++ vext.32 d11,d12,d12,#0 ++ vmlal.s32 q4,d27,d4 ++ vmov.i64 d10,#0 ++ vmlal.s32 q4,d28,d1 ++ vmlal.s32 q5,d16,d0 ++ sub r6,r7,#32 ++ vmlal.s32 q5,d17,d7 ++ vmlal.s32 q5,d14,d6 ++ vext.32 d30,d9,d8,#0 ++ vmlal.s32 q5,d15,d5 ++ vld1.8 {d31},[r6,: 64]! ++ vmlal.s32 q5,d29,d4 ++ vmlal.s32 q15,d20,d0 ++ vext.32 d0,d6,d18,#1 ++ vmlal.s32 q15,d21,d25 ++ vrev64.i32 d0,d0 ++ vmlal.s32 q15,d26,d24 ++ vext.32 d1,d7,d19,#1 ++ vext.32 d7,d10,d10,#0 ++ vmlal.s32 q15,d27,d23 ++ vrev64.i32 d1,d1 ++ vld1.8 {d6},[r6,: 64] ++ vmlal.s32 q15,d28,d22 ++ vmlal.s32 q3,d16,d4 ++ add r6,r6,#24 ++ vmlal.s32 q3,d17,d2 ++ vext.32 d4,d31,d30,#0 ++ vmov d17,d11 ++ vmlal.s32 q3,d14,d1 ++ vext.32 d11,d13,d13,#0 ++ vext.32 d13,d30,d30,#0 ++ vmlal.s32 q3,d15,d0 ++ vext.32 d1,d8,d8,#0 ++ vmlal.s32 q3,d29,d3 ++ vld1.8 {d5},[r6,: 64] ++ sub r6,r6,#16 ++ vext.32 d10,d6,d6,#0 ++ vmov.i32 q1,#0xffffffff ++ vshl.i64 q4,q1,#25 ++ add r7,sp,#512 ++ vld1.8 {d14-d15},[r7,: 128] ++ vadd.i64 q9,q2,q7 ++ vshl.i64 q1,q1,#26 ++ vshr.s64 q10,q9,#26 ++ vld1.8 {d0},[r6,: 64]! ++ vadd.i64 q5,q5,q10 ++ vand q9,q9,q1 ++ vld1.8 {d16},[r6,: 64]! 
++ add r6,sp,#528 ++ vld1.8 {d20-d21},[r6,: 128] ++ vadd.i64 q11,q5,q10 ++ vsub.i64 q2,q2,q9 ++ vshr.s64 q9,q11,#25 ++ vext.32 d12,d5,d4,#0 ++ vand q11,q11,q4 ++ vadd.i64 q0,q0,q9 ++ vmov d19,d7 ++ vadd.i64 q3,q0,q7 ++ vsub.i64 q5,q5,q11 ++ vshr.s64 q11,q3,#26 ++ vext.32 d18,d11,d10,#0 ++ vand q3,q3,q1 ++ vadd.i64 q8,q8,q11 ++ vadd.i64 q11,q8,q10 ++ vsub.i64 q0,q0,q3 ++ vshr.s64 q3,q11,#25 ++ vand q11,q11,q4 ++ vadd.i64 q3,q6,q3 ++ vadd.i64 q6,q3,q7 ++ vsub.i64 q8,q8,q11 ++ vshr.s64 q11,q6,#26 ++ vand q6,q6,q1 ++ vadd.i64 q9,q9,q11 ++ vadd.i64 d25,d19,d21 ++ vsub.i64 q3,q3,q6 ++ vshr.s64 d23,d25,#25 ++ vand q4,q12,q4 ++ vadd.i64 d21,d23,d23 ++ vshl.i64 d25,d23,#4 ++ vadd.i64 d21,d21,d23 ++ vadd.i64 d25,d25,d21 ++ vadd.i64 d4,d4,d25 ++ vzip.i32 q0,q8 ++ vadd.i64 d12,d4,d14 ++ add r6,r8,#8 ++ vst1.8 d0,[r6,: 64] ++ vsub.i64 d19,d19,d9 ++ add r6,r6,#16 ++ vst1.8 d16,[r6,: 64] ++ vshr.s64 d22,d12,#26 ++ vand q0,q6,q1 ++ vadd.i64 d10,d10,d22 ++ vzip.i32 q3,q9 ++ vsub.i64 d4,d4,d0 ++ sub r6,r6,#8 ++ vst1.8 d6,[r6,: 64] ++ add r6,r6,#16 ++ vst1.8 d18,[r6,: 64] ++ vzip.i32 q2,q5 ++ sub r6,r6,#32 ++ vst1.8 d4,[r6,: 64] ++ subs r5,r5,#1 ++ bhi .Lsquaringloop ++ .Lskipsquaringloop: ++ mov r2,r2 ++ add r5,r3,#288 ++ add r6,r3,#144 ++ vmov.i32 q0,#19 ++ vmov.i32 q1,#0 ++ vmov.i32 q2,#1 ++ vzip.i32 q1,q2 ++ vld1.8 {d4-d5},[r5,: 128]! ++ vld1.8 {d6-d7},[r5,: 128]! ++ vld1.8 {d9},[r5,: 64] ++ vld1.8 {d10-d11},[r2,: 128]! ++ add r5,sp,#416 ++ vld1.8 {d12-d13},[r2,: 128]! ++ vmul.i32 q7,q2,q0 ++ vld1.8 {d8},[r2,: 64] ++ vext.32 d17,d11,d10,#1 ++ vmul.i32 q9,q3,q0 ++ vext.32 d16,d10,d8,#1 ++ vshl.u32 q10,q5,q1 ++ vext.32 d22,d14,d4,#1 ++ vext.32 d24,d18,d6,#1 ++ vshl.u32 q13,q6,q1 ++ vshl.u32 d28,d8,d2 ++ vrev64.i32 d22,d22 ++ vmul.i32 d1,d9,d1 ++ vrev64.i32 d24,d24 ++ vext.32 d29,d8,d13,#1 ++ vext.32 d0,d1,d9,#1 ++ vrev64.i32 d0,d0 ++ vext.32 d2,d9,d1,#1 ++ vext.32 d23,d15,d5,#1 ++ vmull.s32 q4,d20,d4 ++ vrev64.i32 d23,d23 ++ vmlal.s32 q4,d21,d1 ++ vrev64.i32 d2,d2 ++ vmlal.s32 q4,d26,d19 ++ vext.32 d3,d5,d15,#1 ++ vmlal.s32 q4,d27,d18 ++ vrev64.i32 d3,d3 ++ vmlal.s32 q4,d28,d15 ++ vext.32 d14,d12,d11,#1 ++ vmull.s32 q5,d16,d23 ++ vext.32 d15,d13,d12,#1 ++ vmlal.s32 q5,d17,d4 ++ vst1.8 d8,[r5,: 64]! ++ vmlal.s32 q5,d14,d1 ++ vext.32 d12,d9,d8,#0 ++ vmlal.s32 q5,d15,d19 ++ vmov.i64 d13,#0 ++ vmlal.s32 q5,d29,d18 ++ vext.32 d25,d19,d7,#1 ++ vmlal.s32 q6,d20,d5 ++ vrev64.i32 d25,d25 ++ vmlal.s32 q6,d21,d4 ++ vst1.8 d11,[r5,: 64]! ++ vmlal.s32 q6,d26,d1 ++ vext.32 d9,d10,d10,#0 ++ vmlal.s32 q6,d27,d19 ++ vmov.i64 d8,#0 ++ vmlal.s32 q6,d28,d18 ++ vmlal.s32 q4,d16,d24 ++ vmlal.s32 q4,d17,d5 ++ vmlal.s32 q4,d14,d4 ++ vst1.8 d12,[r5,: 64]! ++ vmlal.s32 q4,d15,d1 ++ vext.32 d10,d13,d12,#0 ++ vmlal.s32 q4,d29,d19 ++ vmov.i64 d11,#0 ++ vmlal.s32 q5,d20,d6 ++ vmlal.s32 q5,d21,d5 ++ vmlal.s32 q5,d26,d4 ++ vext.32 d13,d8,d8,#0 ++ vmlal.s32 q5,d27,d1 ++ vmov.i64 d12,#0 ++ vmlal.s32 q5,d28,d19 ++ vst1.8 d9,[r5,: 64]! ++ vmlal.s32 q6,d16,d25 ++ vmlal.s32 q6,d17,d6 ++ vst1.8 d10,[r5,: 64] ++ vmlal.s32 q6,d14,d5 ++ vext.32 d8,d11,d10,#0 ++ vmlal.s32 q6,d15,d4 ++ vmov.i64 d9,#0 ++ vmlal.s32 q6,d29,d1 ++ vmlal.s32 q4,d20,d7 ++ vmlal.s32 q4,d21,d6 ++ vmlal.s32 q4,d26,d5 ++ vext.32 d11,d12,d12,#0 ++ vmlal.s32 q4,d27,d4 ++ vmov.i64 d10,#0 ++ vmlal.s32 q4,d28,d1 ++ vmlal.s32 q5,d16,d0 ++ sub r2,r5,#32 ++ vmlal.s32 q5,d17,d7 ++ vmlal.s32 q5,d14,d6 ++ vext.32 d30,d9,d8,#0 ++ vmlal.s32 q5,d15,d5 ++ vld1.8 {d31},[r2,: 64]! 
++ vmlal.s32 q5,d29,d4 ++ vmlal.s32 q15,d20,d0 ++ vext.32 d0,d6,d18,#1 ++ vmlal.s32 q15,d21,d25 ++ vrev64.i32 d0,d0 ++ vmlal.s32 q15,d26,d24 ++ vext.32 d1,d7,d19,#1 ++ vext.32 d7,d10,d10,#0 ++ vmlal.s32 q15,d27,d23 ++ vrev64.i32 d1,d1 ++ vld1.8 {d6},[r2,: 64] ++ vmlal.s32 q15,d28,d22 ++ vmlal.s32 q3,d16,d4 ++ add r2,r2,#24 ++ vmlal.s32 q3,d17,d2 ++ vext.32 d4,d31,d30,#0 ++ vmov d17,d11 ++ vmlal.s32 q3,d14,d1 ++ vext.32 d11,d13,d13,#0 ++ vext.32 d13,d30,d30,#0 ++ vmlal.s32 q3,d15,d0 ++ vext.32 d1,d8,d8,#0 ++ vmlal.s32 q3,d29,d3 ++ vld1.8 {d5},[r2,: 64] ++ sub r2,r2,#16 ++ vext.32 d10,d6,d6,#0 ++ vmov.i32 q1,#0xffffffff ++ vshl.i64 q4,q1,#25 ++ add r5,sp,#512 ++ vld1.8 {d14-d15},[r5,: 128] ++ vadd.i64 q9,q2,q7 ++ vshl.i64 q1,q1,#26 ++ vshr.s64 q10,q9,#26 ++ vld1.8 {d0},[r2,: 64]! ++ vadd.i64 q5,q5,q10 ++ vand q9,q9,q1 ++ vld1.8 {d16},[r2,: 64]! ++ add r2,sp,#528 ++ vld1.8 {d20-d21},[r2,: 128] ++ vadd.i64 q11,q5,q10 ++ vsub.i64 q2,q2,q9 ++ vshr.s64 q9,q11,#25 ++ vext.32 d12,d5,d4,#0 ++ vand q11,q11,q4 ++ vadd.i64 q0,q0,q9 ++ vmov d19,d7 ++ vadd.i64 q3,q0,q7 ++ vsub.i64 q5,q5,q11 ++ vshr.s64 q11,q3,#26 ++ vext.32 d18,d11,d10,#0 ++ vand q3,q3,q1 ++ vadd.i64 q8,q8,q11 ++ vadd.i64 q11,q8,q10 ++ vsub.i64 q0,q0,q3 ++ vshr.s64 q3,q11,#25 ++ vand q11,q11,q4 ++ vadd.i64 q3,q6,q3 ++ vadd.i64 q6,q3,q7 ++ vsub.i64 q8,q8,q11 ++ vshr.s64 q11,q6,#26 ++ vand q6,q6,q1 ++ vadd.i64 q9,q9,q11 ++ vadd.i64 d25,d19,d21 ++ vsub.i64 q3,q3,q6 ++ vshr.s64 d23,d25,#25 ++ vand q4,q12,q4 ++ vadd.i64 d21,d23,d23 ++ vshl.i64 d25,d23,#4 ++ vadd.i64 d21,d21,d23 ++ vadd.i64 d25,d25,d21 ++ vadd.i64 d4,d4,d25 ++ vzip.i32 q0,q8 ++ vadd.i64 d12,d4,d14 ++ add r2,r6,#8 ++ vst1.8 d0,[r2,: 64] ++ vsub.i64 d19,d19,d9 ++ add r2,r2,#16 ++ vst1.8 d16,[r2,: 64] ++ vshr.s64 d22,d12,#26 ++ vand q0,q6,q1 ++ vadd.i64 d10,d10,d22 ++ vzip.i32 q3,q9 ++ vsub.i64 d4,d4,d0 ++ sub r2,r2,#8 ++ vst1.8 d6,[r2,: 64] ++ add r2,r2,#16 ++ vst1.8 d18,[r2,: 64] ++ vzip.i32 q2,q5 ++ sub r2,r2,#32 ++ vst1.8 d4,[r2,: 64] ++ cmp r4,#0 ++ beq .Lskippostcopy ++ add r2,r3,#144 ++ mov r4,r4 ++ vld1.8 {d0-d1},[r2,: 128]! ++ vld1.8 {d2-d3},[r2,: 128]! ++ vld1.8 {d4},[r2,: 64] ++ vst1.8 {d0-d1},[r4,: 128]! ++ vst1.8 {d2-d3},[r4,: 128]! ++ vst1.8 d4,[r4,: 64] ++ .Lskippostcopy: ++ cmp r1,#1 ++ bne .Lskipfinalcopy ++ add r2,r3,#288 ++ add r4,r3,#144 ++ vld1.8 {d0-d1},[r2,: 128]! ++ vld1.8 {d2-d3},[r2,: 128]! ++ vld1.8 {d4},[r2,: 64] ++ vst1.8 {d0-d1},[r4,: 128]! ++ vst1.8 {d2-d3},[r4,: 128]! 
++ vst1.8 d4,[r4,: 64] ++ .Lskipfinalcopy: ++ add r1,r1,#1 ++ cmp r1,#12 ++ blo .Linvertloop ++ add r1,r3,#144 ++ ldr r2,[r1],#4 ++ ldr r3,[r1],#4 ++ ldr r4,[r1],#4 ++ ldr r5,[r1],#4 ++ ldr r6,[r1],#4 ++ ldr r7,[r1],#4 ++ ldr r8,[r1],#4 ++ ldr r9,[r1],#4 ++ ldr r10,[r1],#4 ++ ldr r1,[r1] ++ add r11,r1,r1,LSL #4 ++ add r11,r11,r1,LSL #1 ++ add r11,r11,#16777216 ++ mov r11,r11,ASR #25 ++ add r11,r11,r2 ++ mov r11,r11,ASR #26 ++ add r11,r11,r3 ++ mov r11,r11,ASR #25 ++ add r11,r11,r4 ++ mov r11,r11,ASR #26 ++ add r11,r11,r5 ++ mov r11,r11,ASR #25 ++ add r11,r11,r6 ++ mov r11,r11,ASR #26 ++ add r11,r11,r7 ++ mov r11,r11,ASR #25 ++ add r11,r11,r8 ++ mov r11,r11,ASR #26 ++ add r11,r11,r9 ++ mov r11,r11,ASR #25 ++ add r11,r11,r10 ++ mov r11,r11,ASR #26 ++ add r11,r11,r1 ++ mov r11,r11,ASR #25 ++ add r2,r2,r11 ++ add r2,r2,r11,LSL #1 ++ add r2,r2,r11,LSL #4 ++ mov r11,r2,ASR #26 ++ add r3,r3,r11 ++ sub r2,r2,r11,LSL #26 ++ mov r11,r3,ASR #25 ++ add r4,r4,r11 ++ sub r3,r3,r11,LSL #25 ++ mov r11,r4,ASR #26 ++ add r5,r5,r11 ++ sub r4,r4,r11,LSL #26 ++ mov r11,r5,ASR #25 ++ add r6,r6,r11 ++ sub r5,r5,r11,LSL #25 ++ mov r11,r6,ASR #26 ++ add r7,r7,r11 ++ sub r6,r6,r11,LSL #26 ++ mov r11,r7,ASR #25 ++ add r8,r8,r11 ++ sub r7,r7,r11,LSL #25 ++ mov r11,r8,ASR #26 ++ add r9,r9,r11 ++ sub r8,r8,r11,LSL #26 ++ mov r11,r9,ASR #25 ++ add r10,r10,r11 ++ sub r9,r9,r11,LSL #25 ++ mov r11,r10,ASR #26 ++ add r1,r1,r11 ++ sub r10,r10,r11,LSL #26 ++ mov r11,r1,ASR #25 ++ sub r1,r1,r11,LSL #25 ++ add r2,r2,r3,LSL #26 ++ mov r3,r3,LSR #6 ++ add r3,r3,r4,LSL #19 ++ mov r4,r4,LSR #13 ++ add r4,r4,r5,LSL #13 ++ mov r5,r5,LSR #19 ++ add r5,r5,r6,LSL #6 ++ add r6,r7,r8,LSL #25 ++ mov r7,r8,LSR #7 ++ add r7,r7,r9,LSL #19 ++ mov r8,r9,LSR #13 ++ add r8,r8,r10,LSL #12 ++ mov r9,r10,LSR #20 ++ add r1,r9,r1,LSL #6 ++ str r2,[r0],#4 ++ str r3,[r0],#4 ++ str r4,[r0],#4 ++ str r5,[r0],#4 ++ str r6,[r0],#4 ++ str r7,[r0],#4 ++ str r8,[r0],#4 ++ str r1,[r0] ++ ldrd r4,[sp,#0] ++ ldrd r6,[sp,#8] ++ ldrd r8,[sp,#16] ++ ldrd r10,[sp,#24] ++ ldr r12,[sp,#480] ++ ldr r14,[sp,#484] ++ ldr r0,=0 ++ mov sp,r12 ++ vpop {q4,q5,q6,q7} ++ bx lr ++ENDPROC(curve25519_asm_neon) +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/crypto/poly1305-avx2-x86_64.S 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,387 @@ +/* + * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions + * @@ -9590,11 +17426,12 @@ + +#include + -+.data ++.section .rodata.cst32.ANMASK, "aM", @progbits, 32 +.align 32 -+ +ANMASK: .octa 0x0000000003ffffff0000000003ffffff + .octa 0x0000000003ffffff0000000003ffffff ++.section .rodata.cst32.ORMASK, "aM", @progbits, 32 ++.align 32 +ORMASK: .octa 0x00000000010000000000000001000000 + .octa 0x00000000010000000000000001000000 + @@ -9963,9 +17800,9 @@ + pop %rbx + ret +ENDPROC(poly1305_asm_4block_avx2) ---- /dev/null -+++ b/net/wireguard/crypto/poly1305-sse2-x86_64.S -@@ -0,0 +1,582 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/crypto/poly1305-sse2-x86_64.S 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,583 @@ +/* + * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions + * @@ -9979,10 +17816,11 @@ + +#include + -+.data ++.section .rodata.cst16.ANMASK, "aM", @progbits, 16 +.align 16 -+ +ANMASK: .octa 0x0000000003ffffff0000000003ffffff ++.section .rodata.cst16.ORMASK, "aM", @progbits, 16 ++.align 16 +ORMASK: .octa 0x00000000010000000000000001000000 + +.text @@ -10548,58 +18386,55 @@ + pop %rbx + ret +ENDPROC(poly1305_asm_2block_sse2) ---- /dev/null -+++ 
b/net/wireguard/Makefile
+--- /dev/null 2017-07-05 16:27:37.615351856 +0200
++++ b/net/wireguard/Makefile 2017-07-06 18:17:33.000000000 +0200
@@ -0,0 +1,36 @@
+ccflags-y := -O3 -fvisibility=hidden
-+ccflags-$(CONFIG_WIREGUARD_DEBUG) := -DDEBUG -g
++ccflags-$(CONFIG_WIREGUARD_DEBUG) += -DDEBUG -g
+ccflags-y += -Wframe-larger-than=8192
+ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
+wireguard-y := main.o noise.o device.o peer.o timers.o data.o send.o receive.o socket.o config.o hashtables.o routingtable.o ratelimiter.o cookie.o
+wireguard-y += crypto/curve25519.o crypto/chacha20poly1305.o crypto/blake2s.o
++
+ifeq ($(CONFIG_X86_64),y)
+ wireguard-y += crypto/chacha20-ssse3-x86_64.o crypto/poly1305-sse2-x86_64.o
-+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1$(comma)4)$(comma)%ymm2,yes,no)
-+ifeq ($(avx2_supported),yes)
-+ wireguard-y += crypto/chacha20-avx2-x86_64.o crypto/poly1305-avx2-x86_64.o
++ avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
++ ifeq ($(avx_supported),yes)
++ wireguard-y += crypto/blake2s-avx-x86_64.o crypto/curve25519-avx-x86_64.o
++ endif
++ avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1$(comma)4)$(comma)%ymm2,yes,no)
++ ifeq ($(avx2_supported),yes)
++ wireguard-y += crypto/chacha20-avx2-x86_64.o crypto/poly1305-avx2-x86_64.o
++ endif
++endif
++
++ifeq ($(CONFIG_ARM64),y)
++ wireguard-$(CONFIG_KERNEL_MODE_NEON) += crypto/chacha20-neon-arm64.o
++endif
++ifeq ($(CONFIG_ARM),y)
++ wireguard-$(CONFIG_KERNEL_MODE_NEON) += crypto/chacha20-neon-arm.o crypto/curve25519-neon-arm.o
++endif
++
++ifneq ($(KBUILD_EXTMOD),)
++CONFIG_WIREGUARD := m
++ifneq ($(CONFIG_SMP),)
++ccflags-y += -DCONFIG_WIREGUARD_PARALLEL=y
+endif
+endif
+
+include $(src)/compat/Makefile.include
+
-+ifneq ($(KBUILD_EXTMOD),)
-+CONFIG_WIREGUARD := m
-+ifeq ($(CONFIG_WIREGUARD_PARALLEL),)
-+ifneq (,$(filter $(CONFIG_PADATA),y m))
-+ccflags-y += -DCONFIG_WIREGUARD_PARALLEL=y
-+endif
-+endif
-+ifneq ($(CONFIG_MODULES),)
-+ifeq ($(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT),)
-+$(error "WireGuard requires CONFIG_NETFILTER_XT_MATCH_HASHLIMIT to be configured in your kernel. See https://www.wireguard.io/install/#kernel-requirements for more info")
-+endif
-+ifeq ($(CONFIG_PADATA),)
-+ifneq ($(CONFIG_SMP),)
-+$(warning "PEFORMANCE WARNING: WireGuard has enormous speed benefits when using CONFIG_PADATA on SMP systems. Please enable CONFIG_PADATA in your kernel configuration. See https://www.wireguard.io/install/#kernel-requirements for more info.")
-+endif
-+endif
-+endif
-+endif
-+
+obj-$(CONFIG_WIREGUARD) := wireguard.o
---- /dev/null
-+++ b/net/wireguard/Kconfig
+--- /dev/null 2017-07-05 16:27:37.615351856 +0200
++++ b/net/wireguard/Kconfig 2017-07-06 18:17:33.000000000 +0200
-@@ -0,0 +1,43 @@
+@@ -0,0 +1,40 @@
+config WIREGUARD
+ tristate "IP: WireGuard secure network tunnel"
+ depends on NET && INET
+ select NET_UDP_TUNNEL
-+ select NETFILTER_XT_MATCH_HASHLIMIT
-+ select NETFILTER
-+ select NETFILTER_XTABLES
-+ select NETFILTER_ADVANCED
+ select CRYPTO_BLKCIPHER
-+ select IP6_NF_IPTABLES if IPV6
++ select NEON
++ select KERNEL_MODE_NEON
+ default m
+ ---help---
+ WireGuard is a secure, fast, and easy to use replacement for IPSec
@@ -10633,22 +18468,12 @@
+ only useful for debugging.
+
+ Say N here unless you know what you're doing.
---- /dev/null -+++ b/net/wireguard/compat/Makefile.include -@@ -0,0 +1,11 @@ -+ccflags-y += -include $(src)/compat/compat.h -+ -+ifeq ($(wildcard $(srctree)/include/linux/siphash.h),) -+ccflags-y += -I$(src)/compat/siphash/include -+wireguard-y += compat/siphash/siphash.o -+endif -+ -+ifeq ($(wildcard $(srctree)/include/net/dst_cache.h),) -+ccflags-y += -I$(src)/compat/dst_cache/include -+wireguard-y += compat/dst_cache/dst_cache.o -+endif ---- /dev/null -+++ b/net/wireguard/compat/siphash/include/linux/siphash.h +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/compat/fpu/include/asm/fpu/api.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1 @@ ++#include +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/compat/siphash/include/linux/siphash.h 2017-07-06 18:17:33.000000000 +0200 @@ -0,0 +1,140 @@ +/* Copyright (C) 2016 Jason A. Donenfeld . All Rights Reserved. + * @@ -10790,9 +18615,9 @@ +} + +#endif /* _LINUX_SIPHASH_H */ ---- /dev/null -+++ b/net/wireguard/compat/siphash/siphash.c -@@ -0,0 +1,551 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/compat/siphash/siphash.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,539 @@ +/* Copyright (C) 2016 Jason A. Donenfeld . All Rights Reserved. + * + * This file is provided under a dual BSD/GPLv2 license. @@ -10808,6 +18633,14 @@ +#include +#include + ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0) ++#ifdef __LITTLE_ENDIAN ++#define bytemask_from_count(cnt) (~(~0ul << (cnt)*8)) ++#else ++#define bytemask_from_count(cnt) (~(~0ul >> (cnt)*8)) ++#endif ++#endif ++ +#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 +#include +#include @@ -10874,7 +18707,6 @@ +#endif + POSTAMBLE +} -+EXPORT_SYMBOL(__siphash_aligned); + +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) @@ -10907,7 +18739,6 @@ +#endif + POSTAMBLE +} -+EXPORT_SYMBOL(__siphash_unaligned); +#endif + +/** @@ -10924,7 +18755,6 @@ + v0 ^= first; + POSTAMBLE +} -+EXPORT_SYMBOL(siphash_1u64); + +/** + * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64 @@ -10945,7 +18775,6 @@ + v0 ^= second; + POSTAMBLE +} -+EXPORT_SYMBOL(siphash_2u64); + +/** + * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64 @@ -10972,7 +18801,6 @@ + v0 ^= third; + POSTAMBLE +} -+EXPORT_SYMBOL(siphash_3u64); + +/** + * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64 @@ -11004,7 +18832,6 @@ + v0 ^= forth; + POSTAMBLE +} -+EXPORT_SYMBOL(siphash_4u64); + +u64 siphash_1u32(const u32 first, const siphash_key_t *key) +{ @@ -11012,7 +18839,6 @@ + b |= first; + POSTAMBLE +} -+EXPORT_SYMBOL(siphash_1u32); + +u64 siphash_3u32(const u32 first, const u32 second, const u32 third, + const siphash_key_t *key) @@ -11026,7 +18852,6 @@ + b |= third; + POSTAMBLE +} -+EXPORT_SYMBOL(siphash_3u32); + +#if BITS_PER_LONG == 64 +/* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for @@ -11074,7 +18899,6 @@ +#endif + HPOSTAMBLE +} -+EXPORT_SYMBOL(__hsiphash_aligned); + +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +u32 __hsiphash_unaligned(const void *data, size_t len, @@ -11107,7 +18931,6 @@ +#endif + HPOSTAMBLE +} -+EXPORT_SYMBOL(__hsiphash_unaligned); +#endif + +/** @@ -11121,7 +18944,6 @@ + b |= first; + HPOSTAMBLE +} -+EXPORT_SYMBOL(hsiphash_1u32); + +/** + * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 @@ -11138,7 +18960,6 @@ + v0 ^= combined; + HPOSTAMBLE +} -+EXPORT_SYMBOL(hsiphash_2u32); + +/** + * hsiphash_3u32 - 
compute 32-bit hsiphash PRF value of 3 u32 @@ -11158,7 +18979,6 @@ + b |= third; + HPOSTAMBLE +} -+EXPORT_SYMBOL(hsiphash_3u32); + +/** + * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 @@ -11182,7 +19002,6 @@ + v0 ^= combined; + HPOSTAMBLE +} -+EXPORT_SYMBOL(hsiphash_4u32); +#else +#define HSIPROUND \ + do { \ @@ -11232,7 +19051,6 @@ + } + HPOSTAMBLE +} -+EXPORT_SYMBOL(__hsiphash_aligned); + +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +u32 __hsiphash_unaligned(const void *data, size_t len, @@ -11255,7 +19073,6 @@ + } + HPOSTAMBLE +} -+EXPORT_SYMBOL(__hsiphash_unaligned); +#endif + +/** @@ -11271,7 +19088,6 @@ + v0 ^= first; + HPOSTAMBLE +} -+EXPORT_SYMBOL(hsiphash_1u32); + +/** + * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 @@ -11290,7 +19106,6 @@ + v0 ^= second; + HPOSTAMBLE +} -+EXPORT_SYMBOL(hsiphash_2u32); + +/** + * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 @@ -11314,7 +19129,6 @@ + v0 ^= third; + HPOSTAMBLE +} -+EXPORT_SYMBOL(hsiphash_3u32); + +/** + * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 @@ -11342,72 +19156,573 @@ + v0 ^= forth; + HPOSTAMBLE +} -+EXPORT_SYMBOL(hsiphash_4u32); +#endif ---- /dev/null -+++ b/net/wireguard/compat/compat.h -@@ -0,0 +1,198 @@ -+/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/compat/Makefile.include 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,41 @@ ++ifeq ($(wildcard $(src)/compat/compat.h),) ++ccflags-y += -include $(srctree)/$(src)/compat/compat.h ++else ++ccflags-y += -include $(src)/compat/compat.h ++endif + -+#ifndef COMPAT_H -+#define COMPAT_H ++ifeq ($(wildcard $(srctree)/include/linux/siphash.h),) ++ccflags-y += -I$(src)/compat/siphash/include ++wireguard-y += compat/siphash/siphash.o ++endif + -+#include -+#include ++ifeq ($(wildcard $(srctree)/include/net/dst_cache.h),) ++ccflags-y += -I$(src)/compat/dst_cache/include ++wireguard-y += compat/dst_cache/dst_cache.o ++endif ++ ++ifeq ($(wildcard $(srctree)/arch/x86/include/asm/fpu/api.h),) ++ccflags-y += -I$(src)/compat/fpu/include ++endif ++ ++ifeq ($(wildcard $(srctree)/arch/x86/include/asm/simd.h),) ++ccflags-y += -I$(src)/compat/simd/include ++endif ++ ++ifeq ($(wildcard $(srctree)/include/net/udp_tunnel.h),) ++ccflags-y += -I$(src)/compat/udp_tunnel/include ++wireguard-y += compat/udp_tunnel/udp_tunnel.o ++endif ++ ++ifeq ($(shell grep -F "int crypto_memneq" "$(srctree)/include/crypto/algapi.h"),) ++ccflags-y += -include $(src)/compat/memneq/include.h ++wireguard-y += compat/memneq/memneq.o ++endif ++ ++ifneq ($(KBUILD_EXTMOD),) ++ifneq ($(CONFIG_SMP),) ++ifeq (,$(filter $(CONFIG_PADATA),y m)) ++wireguard-y += compat/padata/padata.o ++endif ++endif ++endif +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/compat/udp_tunnel/udp_tunnel.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,380 @@ ++#include ++#include ++#include ++#include +#include ++#include ++#include ++#include ++#include ++#include + ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 12, 0) ++#define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data))) ++#define rcu_dereference_sk_user_data(sk) rcu_dereference(__sk_user_data((sk))) ++#define rcu_assign_sk_user_data(sk, ptr) rcu_assign_pointer(__sk_user_data((sk)), ptr) ++#endif ++ ++/* This is global so, uh, only one real call site... This is the kind of horrific hack you'd expect to see in compat code. 
*/ ++static udp_tunnel_encap_rcv_t encap_rcv = NULL; ++static void our_sk_data_ready(struct sock *sk ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0) ++ ,int unused_vulnerable_length_param ++#endif ++ ) ++{ ++ struct sk_buff *skb; ++ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { ++ skb_orphan(skb); ++ sk_mem_reclaim(sk); ++ encap_rcv(sk, skb); ++ } ++} ++ ++int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, ++ struct socket **sockp) ++{ ++ int err; ++ struct socket *sock = NULL; ++ struct sockaddr_in udp_addr; ++ ++ err = __sock_create(net, AF_INET, SOCK_DGRAM, 0, &sock, 1); ++ if (err < 0) ++ goto error; ++ ++ udp_addr.sin_family = AF_INET; ++ udp_addr.sin_addr = cfg->local_ip; ++ udp_addr.sin_port = cfg->local_udp_port; ++ err = kernel_bind(sock, (struct sockaddr *)&udp_addr, ++ sizeof(udp_addr)); ++ if (err < 0) ++ goto error; ++ ++ if (cfg->peer_udp_port) { ++ udp_addr.sin_family = AF_INET; ++ udp_addr.sin_addr = cfg->peer_ip; ++ udp_addr.sin_port = cfg->peer_udp_port; ++ err = kernel_connect(sock, (struct sockaddr *)&udp_addr, ++ sizeof(udp_addr), 0); ++ if (err < 0) ++ goto error; ++ } ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0) ++ sock->sk->sk_no_check = !cfg->use_udp_checksums; ++#else ++ sock->sk->sk_no_check_tx = !cfg->use_udp_checksums; ++#endif ++ ++ *sockp = sock; ++ return 0; ++ ++error: ++ if (sock) { ++ kernel_sock_shutdown(sock, SHUT_RDWR); ++ sock_release(sock); ++ } ++ *sockp = NULL; ++ return err; ++} ++ ++void setup_udp_tunnel_sock(struct net *net, struct socket *sock, ++ struct udp_tunnel_sock_cfg *cfg) ++{ ++ inet_sk(sock->sk)->mc_loop = 0; ++ encap_rcv = cfg->encap_rcv; ++ rcu_assign_sk_user_data(sock->sk, cfg->sk_user_data); ++ sock->sk->sk_data_ready = our_sk_data_ready; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0) ++static inline __sum16 udp_v4_check(int len, __be32 saddr, ++ __be32 daddr, __wsum base) ++{ ++ return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base); ++} ++ ++static void udp_set_csum(bool nocheck, struct sk_buff *skb, ++ __be32 saddr, __be32 daddr, int len) ++{ ++ struct udphdr *uh = udp_hdr(skb); ++ ++ if (nocheck) ++ uh->check = 0; ++ else if (skb_is_gso(skb)) ++ uh->check = ~udp_v4_check(len, saddr, daddr, 0); ++ else if (skb_dst(skb) && skb_dst(skb)->dev && ++ (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) { ++ ++ BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); ++ ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ skb->csum_start = skb_transport_header(skb) - skb->head; ++ skb->csum_offset = offsetof(struct udphdr, check); ++ uh->check = ~udp_v4_check(len, saddr, daddr, 0); ++ } else { ++ __wsum csum; ++ ++ BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); ++ ++ uh->check = 0; ++ csum = skb_checksum(skb, 0, len, 0); ++ uh->check = udp_v4_check(len, saddr, daddr, csum); ++ if (uh->check == 0) ++ uh->check = CSUM_MANGLED_0; ++ ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ } ++} ++ ++#endif ++ ++static void fake_destructor(struct sk_buff *skb) ++{ ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 12, 0) ++static void our_iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, ++ __be32 src, __be32 dst, __u8 proto, ++ __u8 tos, __u8 ttl, __be16 df, bool xnet) ++{ ++ struct iphdr *iph; ++ ++ skb_scrub_packet(skb, xnet); ++ ++ skb->rxhash = 0; ++ skb_dst_set(skb, &rt->dst); ++ memset(IPCB(skb), 0, sizeof(*IPCB(skb))); ++ ++ /* Push down and install the IP header. 
*/ ++ skb_push(skb, sizeof(struct iphdr)); ++ skb_reset_network_header(skb); ++ ++ iph = ip_hdr(skb); ++ ++ iph->version = 4; ++ iph->ihl = sizeof(struct iphdr) >> 2; ++ iph->frag_off = df; ++ iph->protocol = proto; ++ iph->tos = tos; ++ iph->daddr = dst; ++ iph->saddr = src; ++ iph->ttl = ttl; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 53) ++ __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1); ++#else ++ __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1); ++#endif ++ ++ iptunnel_xmit(skb, skb->dev); ++} ++#define iptunnel_xmit our_iptunnel_xmit ++#endif ++ ++void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, ++ __be32 src, __be32 dst, __u8 tos, __u8 ttl, ++ __be16 df, __be16 src_port, __be16 dst_port, ++ bool xnet, bool nocheck) ++{ ++ struct udphdr *uh; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 12, 0) ++ struct net_device *dev = skb->dev; ++ int ret; ++#endif ++ ++ __skb_push(skb, sizeof(*uh)); ++ skb_reset_transport_header(skb); ++ uh = udp_hdr(skb); ++ ++ uh->dest = dst_port; ++ uh->source = src_port; ++ uh->len = htons(skb->len); ++ ++ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); ++ ++ udp_set_csum(nocheck, skb, src, dst, skb->len); ++ ++ if (!skb->sk) ++ skb->sk = sk; ++ if (!skb->destructor) ++ skb->destructor = fake_destructor; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 12, 0) ++ ret = ++#endif ++ iptunnel_xmit( ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 15, 0) ++ sk, ++#endif ++ rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 12, 0) ++ iptunnel_xmit_stats(ret, &dev->stats, dev->tstats); ++#endif ++} ++ ++void udp_tunnel_sock_release(struct socket *sock) ++{ ++ rcu_assign_sk_user_data(sock->sk, NULL); ++ kernel_sock_shutdown(sock, SHUT_RDWR); ++ sock_release(sock); ++} ++ ++#if IS_ENABLED(CONFIG_IPV6) ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, ++ struct socket **sockp) ++{ ++ struct sockaddr_in6 udp6_addr; ++ int err; ++ struct socket *sock = NULL; ++ ++ err = __sock_create(net, AF_INET6, SOCK_DGRAM, 0, &sock, 1); ++ if (err < 0) ++ goto error; ++ ++ if (cfg->ipv6_v6only) { ++ int val = 1; ++ ++ err = kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, ++ (char *) &val, sizeof(val)); ++ if (err < 0) ++ goto error; ++ } ++ ++ udp6_addr.sin6_family = AF_INET6; ++ memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, ++ sizeof(udp6_addr.sin6_addr)); ++ udp6_addr.sin6_port = cfg->local_udp_port; ++ err = kernel_bind(sock, (struct sockaddr *)&udp6_addr, ++ sizeof(udp6_addr)); ++ if (err < 0) ++ goto error; ++ ++ if (cfg->peer_udp_port) { ++ udp6_addr.sin6_family = AF_INET6; ++ memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6, ++ sizeof(udp6_addr.sin6_addr)); ++ udp6_addr.sin6_port = cfg->peer_udp_port; ++ err = kernel_connect(sock, ++ (struct sockaddr *)&udp6_addr, ++ sizeof(udp6_addr), 0); ++ } ++ if (err < 0) ++ goto error; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0) ++ sock->sk->sk_no_check = !cfg->use_udp_checksums; ++#else ++ udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums); ++ udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums); ++#endif ++ ++ *sockp = sock; ++ return 0; ++ ++error: ++ if (sock) { ++ kernel_sock_shutdown(sock, SHUT_RDWR); ++ sock_release(sock); ++ } ++ *sockp = NULL; ++ return err; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0) ++static 
inline __sum16 udp_v6_check(int len, ++ const struct in6_addr *saddr, ++ const struct in6_addr *daddr, ++ __wsum base) ++{ ++ return csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, base); ++} ++static void udp6_set_csum(bool nocheck, struct sk_buff *skb, ++ const struct in6_addr *saddr, ++ const struct in6_addr *daddr, int len) ++{ ++ struct udphdr *uh = udp_hdr(skb); ++ ++ if (nocheck) ++ uh->check = 0; ++ else if (skb_is_gso(skb)) ++ uh->check = ~udp_v6_check(len, saddr, daddr, 0); ++ else if (skb_dst(skb) && skb_dst(skb)->dev && ++ (skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) { ++ ++ BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); ++ ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ skb->csum_start = skb_transport_header(skb) - skb->head; ++ skb->csum_offset = offsetof(struct udphdr, check); ++ uh->check = ~udp_v6_check(len, saddr, daddr, 0); ++ } else { ++ __wsum csum; ++ ++ BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); ++ ++ uh->check = 0; ++ csum = skb_checksum(skb, 0, len, 0); ++ uh->check = udp_v6_check(len, saddr, daddr, csum); ++ if (uh->check == 0) ++ uh->check = CSUM_MANGLED_0; ++ ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ } ++} ++#endif ++ ++int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, ++ struct sk_buff *skb, ++ struct net_device *dev, struct in6_addr *saddr, ++ struct in6_addr *daddr, ++ __u8 prio, __u8 ttl, __be32 label, ++ __be16 src_port, __be16 dst_port, bool nocheck) ++{ ++ struct udphdr *uh; ++ struct ipv6hdr *ip6h; ++ ++ __skb_push(skb, sizeof(*uh)); ++ skb_reset_transport_header(skb); ++ uh = udp_hdr(skb); ++ ++ uh->dest = dst_port; ++ uh->source = src_port; ++ ++ uh->len = htons(skb->len); ++ ++ skb_dst_set(skb, dst); ++ ++ udp6_set_csum(nocheck, skb, saddr, daddr, skb->len); ++ ++ __skb_push(skb, sizeof(*ip6h)); ++ skb_reset_network_header(skb); ++ ip6h = ipv6_hdr(skb); ++ ip6_flow_hdr(ip6h, prio, label); ++ ip6h->payload_len = htons(skb->len); ++ ip6h->nexthdr = IPPROTO_UDP; ++ ip6h->hop_limit = ttl; ++ ip6h->daddr = *daddr; ++ ip6h->saddr = *saddr; ++ ++ if (!skb->sk) ++ skb->sk = sk; ++ if (!skb->destructor) ++ skb->destructor = fake_destructor; ++ ++ ip6tunnel_xmit(skb, dev); ++ return 0; ++} ++#endif +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/compat/udp_tunnel/udp_tunnel_partial_compat.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,215 @@ +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) -+#error "WireGuard requires Linux >= 3.18" ++#define udp_sock_create4 udp_sock_create ++#define udp_sock_create6 udp_sock_create ++#include ++#include ++#include ++#include ++#include ++#include ++#if IS_ENABLED(CONFIG_IPV6) ++#include ++#include ++#include ++#include ++#include ++#endif ++static inline void fake_destructor(struct sk_buff *skb) ++{ ++} ++typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); ++struct udp_tunnel_sock_cfg { ++ void *sk_user_data; ++ __u8 encap_type; ++ udp_tunnel_encap_rcv_t encap_rcv; ++}; ++/* This is global so, uh, only one real call site... This is the kind of horrific hack you'd expect to see in compat code. 
*/ ++static udp_tunnel_encap_rcv_t encap_rcv = NULL; ++static void our_sk_data_ready(struct sock *sk) ++{ ++ struct sk_buff *skb; ++ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { ++ skb_orphan(skb); ++ sk_mem_reclaim(sk); ++ encap_rcv(sk, skb); ++ } ++} ++static inline void setup_udp_tunnel_sock(struct net *net, struct socket *sock, ++ struct udp_tunnel_sock_cfg *cfg) ++{ ++ struct sock *sk = sock->sk; ++ inet_sk(sk)->mc_loop = 0; ++ encap_rcv = cfg->encap_rcv; ++ rcu_assign_sk_user_data(sk, cfg->sk_user_data); ++ sk->sk_data_ready = our_sk_data_ready; ++} ++static inline void udp_tunnel_sock_release(struct socket *sock) ++{ ++ rcu_assign_sk_user_data(sock->sk, NULL); ++ kernel_sock_shutdown(sock, SHUT_RDWR); ++ sk_release_kernel(sock->sk); ++} ++static inline int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt, ++ struct sk_buff *skb, __be32 src, __be32 dst, ++ __u8 tos, __u8 ttl, __be16 df, __be16 src_port, ++ __be16 dst_port, bool xnet) ++{ ++ struct udphdr *uh; ++ __skb_push(skb, sizeof(*uh)); ++ skb_reset_transport_header(skb); ++ uh = udp_hdr(skb); ++ uh->dest = dst_port; ++ uh->source = src_port; ++ uh->len = htons(skb->len); ++ udp_set_csum(sock->sk->sk_no_check_tx, skb, src, dst, skb->len); ++ return iptunnel_xmit(sock->sk, rt, skb, src, dst, IPPROTO_UDP, ++ tos, ttl, df, xnet); ++} ++#if IS_ENABLED(CONFIG_IPV6) ++static inline int udp_tunnel6_xmit_skb(struct socket *sock, struct dst_entry *dst, ++ struct sk_buff *skb, struct net_device *dev, ++ struct in6_addr *saddr, struct in6_addr *daddr, ++ __u8 prio, __u8 ttl, __be16 src_port, ++ __be16 dst_port) ++{ ++ struct udphdr *uh; ++ struct ipv6hdr *ip6h; ++ struct sock *sk = sock->sk; ++ __skb_push(skb, sizeof(*uh)); ++ skb_reset_transport_header(skb); ++ uh = udp_hdr(skb); ++ uh->dest = dst_port; ++ uh->source = src_port; ++ uh->len = htons(skb->len); ++ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); ++ IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED ++ | IPSKB_REROUTED); ++ skb_dst_set(skb, dst); ++ udp6_set_csum(udp_get_no_check6_tx(sk), skb, &inet6_sk(sk)->saddr, ++ &sk->sk_v6_daddr, skb->len); ++ __skb_push(skb, sizeof(*ip6h)); ++ skb_reset_network_header(skb); ++ ip6h = ipv6_hdr(skb); ++ ip6_flow_hdr(ip6h, prio, htonl(0)); ++ ip6h->payload_len = htons(skb->len); ++ ip6h->nexthdr = IPPROTO_UDP; ++ ip6h->hop_limit = ttl; ++ ip6h->daddr = *daddr; ++ ip6h->saddr = *saddr; ++ ip6tunnel_xmit(skb, dev); ++ return 0; ++} ++#endif +#endif + -+/* These conditionals can't be enforced by an out of tree module very easily, -+ * so we stick them here in compat instead. */ -+#if !IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) -+#error "WireGuard requires CONFIG_NETFILTER_XT_MATCH_HASHLIMIT." -+#endif -+#if IS_ENABLED(CONFIG_IPV6) && !IS_ENABLED(CONFIG_IP6_NF_IPTABLES) -+#error "WireGuard requires CONFIG_IP6_NF_IPTABLES when using CONFIG_IPV6." -+#endif -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) && !defined(DEBUG) && defined(net_dbg_ratelimited) -+#undef net_dbg_ratelimited -+#define net_dbg_ratelimited(fmt, ...) 
do { if (0) no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); } while (0) -+#endif -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) -+#define RCU_LOCKDEP_WARN(cond, message) rcu_lockdep_assert(!(cond), message) -+#endif -+ -+#if (LINUX_VERSION_CODE > KERNEL_VERSION(3, 19, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 6)) || LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 12) -+#define dev_recursion_level() 0 -+#endif -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) ++#include ++#include ++#include ++#include +#include +#include +#define udp_tunnel_xmit_skb(a, b, c, d, e, f, g, h, i, j, k, l) do { struct net_device *dev__ = (c)->dev; int ret__; ret__ = udp_tunnel_xmit_skb((b)->sk_socket, a, c, d, e, f, g, h, i, j, k); iptunnel_xmit_stats(ret__, &dev__->stats, dev__->tstats); } while (0) +#if IS_ENABLED(CONFIG_IPV6) +#define udp_tunnel6_xmit_skb(a, b, c, d, e, f, g, h, i, j, k, l) udp_tunnel6_xmit_skb((b)->sk_socket, a, c, d, e, f, g, h, j, k); +#endif -+#elif LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 0) ++#elif LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) +#include +#include ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) +static inline void fake_destructor(struct sk_buff *skb) +{ +} -+#define udp_tunnel_xmit_skb(a, b, c, d, e, f, g, h, i, j, k, l) do { struct net_device *dev__ = (c)->dev; int ret__; (c)->destructor = fake_destructor; (c)->sk = (b); ret__ = udp_tunnel_xmit_skb(a, c, d, e, f, g, h, i, j, k, l); iptunnel_xmit_stats(ret__, &dev__->stats, dev__->tstats); } while (0) ++#endif ++#define udp_tunnel_xmit_skb(a, b, c, d, e, f, g, h, i, j, k, l) do { struct net_device *dev__ = (c)->dev; int ret__; if (!(c)->destructor) (c)->destructor = fake_destructor; if (!(c)->sk) (c)->sk = (b); ret__ = udp_tunnel_xmit_skb(a, c, d, e, f, g, h, i, j, k, l); iptunnel_xmit_stats(ret__, &dev__->stats, dev__->tstats); } while (0) +#if IS_ENABLED(CONFIG_IPV6) -+#define udp_tunnel6_xmit_skb(a, b, c, d, e, f, g, h, i, j, k, l) do { (c)->destructor = fake_destructor; (c)->sk = (b); udp_tunnel6_xmit_skb(a, c, d, e, f, g, h, j, k, l); } while(0) ++#define udp_tunnel6_xmit_skb(a, b, c, d, e, f, g, h, i, j, k, l) do { if (!(c)->destructor) (c)->destructor = fake_destructor; if (!(c)->sk) (c)->sk = (b); udp_tunnel6_xmit_skb(a, c, d, e, f, g, h, j, k, l); } while(0) +#endif +#else + -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) +#include +#include +#define udp_tunnel_xmit_skb(a, b, c, d, e, f, g, h, i, j, k, l) do { struct net_device *dev__ = (c)->dev; int ret__ = udp_tunnel_xmit_skb(a, b, c, d, e, f, g, h, i, j, k, l); iptunnel_xmit_stats(ret__, &dev__->stats, dev__->tstats); } while (0) +#endif + -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 6, 0) && IS_ENABLED(CONFIG_IPV6) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 6, 0) && IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) +#include +#include +#define udp_tunnel6_xmit_skb(a, b, c, d, e, f, g, h, i, j, k, l) udp_tunnel6_xmit_skb(a, b, c, d, e, f, g, h, j, k, l) @@ -11415,7 +19730,8 @@ + +#endif + -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) ++#include +#include +#include +struct udp_port_cfg_new { @@ -11486,12 +19802,358 @@ +#define udp_port_cfg udp_port_cfg_new 
+#define udp_sock_create(a, b, c) udp_sock_create_new(a, b, c) +#endif +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/compat/udp_tunnel/include/net/udp_tunnel.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,94 @@ ++#ifndef __NET_UDP_TUNNEL_H ++#define __NET_UDP_TUNNEL_H ++ ++#include ++#include ++ ++#if IS_ENABLED(CONFIG_IPV6) ++#include ++#include ++#endif ++ ++struct udp_port_cfg { ++ u8 family; ++ ++ /* Used only for kernel-created sockets */ ++ union { ++ struct in_addr local_ip; ++#if IS_ENABLED(CONFIG_IPV6) ++ struct in6_addr local_ip6; ++#endif ++ }; ++ ++ union { ++ struct in_addr peer_ip; ++#if IS_ENABLED(CONFIG_IPV6) ++ struct in6_addr peer_ip6; ++#endif ++ }; ++ ++ __be16 local_udp_port; ++ __be16 peer_udp_port; ++ unsigned int use_udp_checksums:1, ++ use_udp6_tx_checksums:1, ++ use_udp6_rx_checksums:1, ++ ipv6_v6only:1; ++}; ++ ++int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, ++ struct socket **sockp); ++ ++#if IS_ENABLED(CONFIG_IPV6) ++int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, ++ struct socket **sockp); ++#else ++static inline int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, ++ struct socket **sockp) ++{ ++ return 0; ++} ++#endif ++ ++static inline int udp_sock_create(struct net *net, ++ struct udp_port_cfg *cfg, ++ struct socket **sockp) ++{ ++ if (cfg->family == AF_INET) ++ return udp_sock_create4(net, cfg, sockp); ++ ++ if (cfg->family == AF_INET6) ++ return udp_sock_create6(net, cfg, sockp); ++ ++ return -EPFNOSUPPORT; ++} ++ ++typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); ++ ++struct udp_tunnel_sock_cfg { ++ void *sk_user_data; ++ __u8 encap_type; ++ udp_tunnel_encap_rcv_t encap_rcv; ++}; ++ ++/* Setup the given (UDP) sock to receive UDP encapsulated packets */ ++void setup_udp_tunnel_sock(struct net *net, struct socket *sock, ++ struct udp_tunnel_sock_cfg *sock_cfg); ++ ++/* Transmit the skb using UDP encapsulation. */ ++void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, ++ __be32 src, __be32 dst, __u8 tos, __u8 ttl, ++ __be16 df, __be16 src_port, __be16 dst_port, ++ bool xnet, bool nocheck); ++ ++#if IS_ENABLED(CONFIG_IPV6) ++int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, ++ struct sk_buff *skb, ++ struct net_device *dev, struct in6_addr *saddr, ++ struct in6_addr *daddr, ++ __u8 prio, __u8 ttl, __be32 label, ++ __be16 src_port, __be16 dst_port, bool nocheck); ++#endif ++ ++void udp_tunnel_sock_release(struct socket *sock); ++ ++#endif +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/compat/memneq/include.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,5 @@ ++extern noinline unsigned long __crypto_memneq(const void *a, const void *b, size_t size); ++static inline int crypto_memneq(const void *a, const void *b, size_t size) ++{ ++ return __crypto_memneq(a, b, size) != 0UL ? 1 : 0; ++} +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/compat/memneq/memneq.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,170 @@ ++/* ++ * Constant-time equality testing of memory regions. ++ * ++ * Authors: ++ * ++ * James Yonan ++ * Daniel Borkmann ++ * ++ * This file is provided under a dual BSD/GPLv2 license. When using or ++ * redistributing this file, you may do so under either license. ++ * ++ * GPL LICENSE SUMMARY ++ * ++ * Copyright(c) 2013 OpenVPN Technologies, Inc. All rights reserved. 
++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of version 2 of the GNU General Public License as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. ++ * The full GNU General Public License is included in this distribution ++ * in the file called LICENSE.GPL. ++ * ++ * BSD LICENSE ++ * ++ * Copyright(c) 2013 OpenVPN Technologies, Inc. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * * Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * * Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * * Neither the name of OpenVPN Technologies nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++ ++/* Make the optimizer believe the variable can be manipulated arbitrarily. 
*/ ++#define COMPILER_OPTIMIZER_HIDE_VAR(var) __asm__ ("" : "=r" (var) : "0" (var)) ++ ++#ifndef __HAVE_ARCH_CRYPTO_MEMNEQ ++ ++/* Generic path for arbitrary size */ ++static inline unsigned long ++__crypto_memneq_generic(const void *a, const void *b, size_t size) ++{ ++ unsigned long neq = 0; ++ ++#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ++ while (size >= sizeof(unsigned long)) { ++ neq |= *(unsigned long *)a ^ *(unsigned long *)b; ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ a += sizeof(unsigned long); ++ b += sizeof(unsigned long); ++ size -= sizeof(unsigned long); ++ } ++#endif /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ ++ while (size > 0) { ++ neq |= *(unsigned char *)a ^ *(unsigned char *)b; ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ a += 1; ++ b += 1; ++ size -= 1; ++ } ++ return neq; ++} ++ ++/* Loop-free fast-path for frequently used 16-byte size */ ++static inline unsigned long __crypto_memneq_16(const void *a, const void *b) ++{ ++ unsigned long neq = 0; ++ ++#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS ++ if (sizeof(unsigned long) == 8) { ++ neq |= *(unsigned long *)(a) ^ *(unsigned long *)(b); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned long *)(a+8) ^ *(unsigned long *)(b+8); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ } else if (sizeof(unsigned int) == 4) { ++ neq |= *(unsigned int *)(a) ^ *(unsigned int *)(b); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned int *)(a+4) ^ *(unsigned int *)(b+4); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned int *)(a+8) ^ *(unsigned int *)(b+8); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned int *)(a+12) ^ *(unsigned int *)(b+12); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ } else ++#endif /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ ++ { ++ neq |= *(unsigned char *)(a) ^ *(unsigned char *)(b); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned char *)(a+1) ^ *(unsigned char *)(b+1); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned char *)(a+2) ^ *(unsigned char *)(b+2); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned char *)(a+3) ^ *(unsigned char *)(b+3); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned char *)(a+4) ^ *(unsigned char *)(b+4); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned char *)(a+5) ^ *(unsigned char *)(b+5); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned char *)(a+6) ^ *(unsigned char *)(b+6); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned char *)(a+7) ^ *(unsigned char *)(b+7); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned char *)(a+8) ^ *(unsigned char *)(b+8); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned char *)(a+9) ^ *(unsigned char *)(b+9); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned char *)(a+10) ^ *(unsigned char *)(b+10); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned char *)(a+11) ^ *(unsigned char *)(b+11); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned char *)(a+12) ^ *(unsigned char *)(b+12); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned char *)(a+13) ^ *(unsigned char *)(b+13); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned char *)(a+14) ^ *(unsigned char *)(b+14); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ neq |= *(unsigned char *)(a+15) ^ *(unsigned char *)(b+15); ++ COMPILER_OPTIMIZER_HIDE_VAR(neq); ++ } ++ ++ return neq; ++} ++ ++/* Compare two areas of memory without leaking timing information, ++ * and with special optimizations for common sizes. 
Users should ++ * not call this function directly, but should instead use ++ * crypto_memneq defined in crypto/algapi.h. ++ */ ++noinline unsigned long __crypto_memneq(const void *a, const void *b, ++ size_t size) ++{ ++ switch (size) { ++ case 16: ++ return __crypto_memneq_16(a, b); ++ default: ++ return __crypto_memneq_generic(a, b, size); ++ } ++} ++ ++#endif /* __HAVE_ARCH_CRYPTO_MEMNEQ */ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/compat/compat.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,399 @@ ++/* Copyright (C) 2015-2017 Jason A. Donenfeld . All Rights Reserved. */ ++ ++#ifndef COMPAT_H ++#define COMPAT_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef RHEL_MAJOR ++#if RHEL_MAJOR == 7 ++#define ISRHEL7 ++#endif ++#endif ++#ifdef UTS_UBUNTU_RELEASE_ABI ++#if LINUX_VERSION_CODE == KERNEL_VERSION(3, 13, 11) ++#define ISUBUNTU1404 ++#endif ++#endif ++#ifdef CONFIG_SUSE_KERNEL ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0) ++#define ISOPENSUSE42 ++#endif ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 0) ++#error "WireGuard requires Linux >= 3.10" ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0) && defined(CONFIG_X86_64) ++#define CONFIG_AS_SSSE3 ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) && !defined(ISRHEL7) ++#define headers_start data ++#define headers_end data ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) ++#include "udp_tunnel/udp_tunnel_partial_compat.h" ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) && !defined(DEBUG) && defined(net_dbg_ratelimited) ++#undef net_dbg_ratelimited ++#define net_dbg_ratelimited(fmt, ...) do { if (0) no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); } while (0) ++#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) ++#define RCU_LOCKDEP_WARN(cond, message) rcu_lockdep_assert(!(cond), message) ++#endif ++ ++#if ((LINUX_VERSION_CODE > KERNEL_VERSION(3, 19, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 6)) || \ ++ (LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 12) && LINUX_VERSION_CODE > KERNEL_VERSION(3, 17, 0)) || \ ++ (LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 8) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 15, 0)) || \ ++ LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 40)) && !defined(ISRHEL7) && !defined(ISUBUNTU1404) ++#define dev_recursion_level() 0 ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) && !defined(ISRHEL7) +#define ipv6_dst_lookup(a, b, c, d) ipv6_dst_lookup(b, c, d) +#endif + -+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 5) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)) || (LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 17) && LINUX_VERSION_CODE > KERNEL_VERSION(3, 19, 0)) || LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 27) ++#if ((LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 5) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)) || \ ++ (LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 17) && LINUX_VERSION_CODE > KERNEL_VERSION(3, 19, 0)) || \ ++ (LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 27) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0)) || \ ++ (LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 8) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 15, 0)) || \ ++ (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 40) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) || \ ++ (LINUX_VERSION_CODE < KERNEL_VERSION(3, 12, 54))) && !defined(ISUBUNTU1404) ++#include ++#include +#define IP6_ECN_set_ce(a, b) IP6_ECN_set_ce(b) +#endif + @@ -11502,7 +20164,21 @@ +#define 
time_is_after_eq_jiffies64(a) time_before_eq64(get_jiffies_64(), a) +#endif + -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) && IS_ENABLED(CONFIG_IPV6) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 12, 0) && IS_ENABLED(CONFIG_IPV6) && !defined(ISRHEL7) ++#include ++struct ipv6_stub_type { ++ void *udpv6_encap_enable; ++ int (*ipv6_dst_lookup)(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); ++}; ++static const struct ipv6_stub_type ipv6_stub_impl = { ++ .udpv6_encap_enable = (void *)1, ++ .ipv6_dst_lookup = ip6_dst_lookup ++}; ++static const struct ipv6_stub_type *ipv6_stub = &ipv6_stub_impl; ++#endif ++ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) && IS_ENABLED(CONFIG_IPV6) && !defined(ISOPENSUSE42) +#include +static inline bool ipv6_mod_enabled(void) +{ @@ -11520,6 +20196,267 @@ +} +#endif + ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0) ++#include ++static inline u32 get_random_u32(void) ++{ ++ static siphash_key_t key; ++ static u32 counter = 0; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) ++ static bool has_seeded = false; ++ if (unlikely(!has_seeded)) { ++ get_random_bytes(&key, sizeof(key)); ++ has_seeded = true; ++ } ++#else ++ get_random_once(&key, sizeof(key)); ++#endif ++ return siphash_2u32(counter++, get_random_int(), &key); ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) && !defined(ISRHEL7) ++static inline void netif_keep_dst(struct net_device *dev) ++{ ++ dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0) && !defined(ISRHEL7) ++#define pcpu_sw_netstats pcpu_tstats ++#define netdev_alloc_pcpu_stats alloc_percpu ++#elif LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0) && !defined(ISRHEL7) ++#define netdev_alloc_pcpu_stats(type) \ ++({ \ ++ typeof(type) __percpu *pcpu_stats = alloc_percpu(type); \ ++ if (pcpu_stats) { \ ++ int __cpu; \ ++ for_each_possible_cpu (__cpu) { \ ++ typeof(type) *stat; \ ++ stat = per_cpu_ptr(pcpu_stats, __cpu); \ ++ u64_stats_init(&stat->syncp); \ ++ } \ ++ } \ ++ pcpu_stats; \ ++}) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0) ++#include "checksum/checksum_partial_compat.h" ++static inline void *our_pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) ++{ ++ if (tail != skb) { ++ skb->data_len += len; ++ skb->len += len; ++ } ++ return skb_put(tail, len); ++} ++#define pskb_put our_pskb_put ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0) && !defined(ISRHEL7) ++#include ++static inline void skb_scrub_packet(struct sk_buff *skb, bool xnet) ++{ ++#ifdef CONFIG_CAVIUM_OCTEON_IPFWD_OFFLOAD ++ memset(&skb->cvm_info, 0, sizeof(skb->cvm_info)); ++ skb->cvm_reserved = 0; ++#endif ++ skb->tstamp.tv64 = 0; ++ skb->pkt_type = PACKET_HOST; ++ skb->skb_iif = 0; ++ skb_dst_drop(skb); ++ secpath_reset(skb); ++ nf_reset(skb); ++ nf_reset_trace(skb); ++ if (!xnet) ++ return; ++ skb_orphan(skb); ++ skb->mark = 0; ++} ++#endif ++ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 12, 0) || defined(ISUBUNTU1404)) && !defined(ISRHEL7) ++#include ++static inline u32 prandom_u32_max(u32 ep_ro) ++{ ++ return (u32)(((u64) prandom_u32() * ep_ro) >> 32); ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 75) && !defined(ISRHEL7) ++#define U8_MAX ((u8)~0U) ++#define S8_MAX ((s8)(U8_MAX >> 1)) ++#define S8_MIN ((s8)(-S8_MAX - 1)) ++#define U16_MAX ((u16)~0U) ++#define S16_MAX ((s16)(U16_MAX >> 1)) ++#define S16_MIN ((s16)(-S16_MAX - 1)) ++#define U32_MAX ((u32)~0U) ++#define S32_MAX ((s32)(U32_MAX >> 1)) ++#define 
S32_MIN ((s32)(-S32_MAX - 1)) ++#define U64_MAX ((u64)~0ULL) ++#define S64_MAX ((s64)(U64_MAX >> 1)) ++#define S64_MIN ((s64)(-S64_MAX - 1)) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 60) && !defined(ISRHEL7) ++/* Making this static may very well invalidate its usefulness, ++ * but so it goes with compat code. */ ++static inline void memzero_explicit(void *s, size_t count) ++{ ++ memset(s, 0, count); ++ barrier(); ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 12, 0) && !defined(ISRHEL7) ++static const struct in6_addr our_in6addr_any = IN6ADDR_ANY_INIT; ++#define in6addr_any our_in6addr_any ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) ++#include ++#include ++#include ++struct rng_initializer { ++ struct completion done; ++ struct random_ready_callback cb; ++}; ++static inline void rng_initialized_callback(struct random_ready_callback *cb) ++{ ++ complete(&container_of(cb, struct rng_initializer, cb)->done); ++} ++static inline int wait_for_random_bytes(void) ++{ ++ static bool rng_is_initialized = false; ++ int ret; ++ if (unlikely(!rng_is_initialized)) { ++ struct rng_initializer rng = { ++ .done = COMPLETION_INITIALIZER(rng.done), ++ .cb = { .owner = THIS_MODULE, .func = rng_initialized_callback } ++ }; ++ ret = add_random_ready_callback(&rng.cb); ++ if (!ret) { ++ ret = wait_for_completion_interruptible(&rng.done); ++ if (ret) { ++ del_random_ready_callback(&rng.cb); ++ return ret; ++ } ++ } else if (ret != -EALREADY) ++ return ret; ++ rng_is_initialized = true; ++ } ++ return 0; ++} ++#elif LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0) ++/* This is a disaster. Without this API, we really have no way of ++ * knowing if it's initialized. We just return that it has and hope ++ * for the best... 
*/ ++static inline int wait_for_random_bytes(void) ++{ ++ return 0; ++} ++#endif ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0) ++static inline int get_random_bytes_wait(void *buf, int nbytes) ++{ ++ int ret = wait_for_random_bytes(); ++ if (unlikely(ret)) ++ return ret; ++ get_random_bytes(buf, nbytes); ++ return 0; ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0) && !defined(ISRHEL7) ++#define system_power_efficient_wq system_unbound_wq ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 17, 0) && !defined(ISRHEL7) ++#include ++static inline u64 ktime_get_ns(void) ++{ ++ return ktime_to_ns(ktime_get()); ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0) ++#include ++#define inet_confirm_addr(a,b,c,d,e) inet_confirm_addr(b,c,d,e) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) ++#include ++#include ++#include ++static inline void *kvmalloc(size_t size, gfp_t flags) ++{ ++ gfp_t kmalloc_flags = flags; ++ void *ret; ++ if (size > PAGE_SIZE) { ++ kmalloc_flags |= __GFP_NOWARN; ++ if (!(kmalloc_flags & __GFP_REPEAT) || (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) ++ kmalloc_flags |= __GFP_NORETRY; ++ } ++ ret = kmalloc(size, kmalloc_flags); ++ if (ret || size <= PAGE_SIZE) ++ return ret; ++ return __vmalloc(size, flags, PAGE_KERNEL); ++} ++static inline void *kvzalloc(size_t size, gfp_t flags) ++{ ++ return kvmalloc(size, flags | __GFP_ZERO); ++} ++#endif ++ ++#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) || LINUX_VERSION_CODE < KERNEL_VERSION(3, 12, 41)) && !defined(ISUBUNTU1404) ++#include ++static inline void kvfree(const void *addr) ++{ ++ if (is_vmalloc_addr(addr)) ++ vfree(addr); ++ else ++ kfree(addr); ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 9) ++#include ++#define priv_destructor destructor ++#endif ++ ++/* https://lkml.org/lkml/2017/6/23/790 */ ++#if IS_ENABLED(CONFIG_NF_CONNTRACK) ++#include ++#include ++#include ++#include ++#include ++#include ++static inline void new_icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) ++{ ++ enum ip_conntrack_info ctinfo; ++ struct nf_conn *ct = nf_ct_get(skb_in, &ctinfo); ++ if (skb_network_header(skb_in) < skb_in->head || (skb_network_header(skb_in) + sizeof(struct iphdr)) > skb_tail_pointer(skb_in)) ++ return; ++ if (ct) ++ ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip; ++ icmp_send(skb_in, type, code, info); ++} ++static inline void new_icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) ++{ ++ enum ip_conntrack_info ctinfo; ++ struct nf_conn *ct = nf_ct_get(skb, &ctinfo); ++ if (skb_network_header(skb) < skb->head || (skb_network_header(skb) + sizeof(struct ipv6hdr)) > skb_tail_pointer(skb)) ++ return; ++ if (ct) ++ ipv6_hdr(skb)->saddr = ct->tuplehash[0].tuple.src.u3.in6; ++ icmpv6_send(skb, type, code, info); ++} ++#define icmp_send(a,b,c,d) new_icmp_send(a,b,c,d) ++#define icmpv6_send(a,b,c,d) new_icmpv6_send(a,b,c,d) ++#endif ++ +/* https://lkml.org/lkml/2015/6/12/415 */ +#include +static inline struct net_device *netdev_pub(void *dev) @@ -11535,19 +20472,19 @@ +#endif + +#if defined(CONFIG_DYNAMIC_DEBUG) || defined(DEBUG) -+#define net_dbg_skb_ratelimited(fmt, skb, ...) do { \ ++#define net_dbg_skb_ratelimited(fmt, dev, skb, ...) 
do { \ + struct endpoint __endpoint; \ + socket_endpoint_from_skb(&__endpoint, skb); \ -+ net_dbg_ratelimited(fmt, &__endpoint.addr, ##__VA_ARGS__); \ ++ net_dbg_ratelimited(fmt, dev, &__endpoint.addr, ##__VA_ARGS__); \ +} while(0) +#else +#define net_dbg_skb_ratelimited(fmt, skb, ...) +#endif + +#endif ---- /dev/null -+++ b/net/wireguard/compat/dst_cache/dst_cache.c -@@ -0,0 +1,177 @@ +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/compat/dst_cache/dst_cache.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,175 @@ +/* + * net/core/dst_cache.c - dst entry cache + * @@ -11630,7 +20567,6 @@ + + return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); +} -+EXPORT_SYMBOL_GPL(dst_cache_get); + +struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) +{ @@ -11648,7 +20584,6 @@ + *saddr = idst->in_saddr.s_addr; + return container_of(dst, struct rtable, dst); +} -+EXPORT_SYMBOL_GPL(dst_cache_get_ip4); + +void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, + __be32 saddr) @@ -11662,7 +20597,6 @@ + dst_cache_per_cpu_dst_set(idst, dst, 0); + idst->in_saddr.s_addr = saddr; +} -+EXPORT_SYMBOL_GPL(dst_cache_set_ip4); + +#if IS_ENABLED(CONFIG_IPV6) +void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, @@ -11678,7 +20612,6 @@ + rt6_get_cookie((struct rt6_info *)dst)); + idst->in6_saddr = *addr; +} -+EXPORT_SYMBOL_GPL(dst_cache_set_ip6); + +struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, + struct in6_addr *saddr) @@ -11697,20 +20630,23 @@ + *saddr = idst->in6_saddr; + return dst; +} -+EXPORT_SYMBOL_GPL(dst_cache_get_ip6); +#endif + +int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) +{ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) ++ BUG_ON(gfp & GFP_ATOMIC); ++ dst_cache->cache = alloc_percpu(struct dst_cache_pcpu); ++#else + dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, + gfp | __GFP_ZERO); ++#endif + if (!dst_cache->cache) + return -ENOMEM; + + dst_cache_reset(dst_cache); + return 0; +} -+EXPORT_SYMBOL_GPL(dst_cache_init); + +void dst_cache_destroy(struct dst_cache *dst_cache) +{ @@ -11724,9 +20660,8 @@ + + free_percpu(dst_cache->cache); +} -+EXPORT_SYMBOL_GPL(dst_cache_destroy); ---- /dev/null -+++ b/net/wireguard/compat/dst_cache/include/net/dst_cache.h +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/compat/dst_cache/include/net/dst_cache.h 2017-07-06 18:17:33.000000000 +0200 @@ -0,0 +1,97 @@ +#ifndef _NET_DST_CACHE_H +#define _NET_DST_CACHE_H @@ -11825,6 +20760,1112 @@ +void dst_cache_destroy(struct dst_cache *dst_cache); + +#endif +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/compat/simd/include/asm/simd.h 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1 @@ ++#include +--- /dev/null 2017-07-05 16:27:37.615351856 +0200 ++++ b/net/wireguard/compat/padata/padata.c 2017-07-06 18:17:33.000000000 +0200 +@@ -0,0 +1,895 @@ ++/* ++ * padata.c - generic interface to process data streams in parallel ++ * ++ * See Documentation/padata.txt for an api documentation. ++ * ++ * Copyright (C) 2008, 2009 secunet Security Networks AG ++ * Copyright (C) 2008, 2009 Steffen Klassert ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms and conditions of the GNU General Public License, ++ * version 2, as published by the Free Software Foundation. 
++ * ++ * This program is distributed in the hope it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ * more details. ++ * ++ * You should have received a copy of the GNU General Public License along with ++ * this program; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define MAX_OBJ_NUM 1000 ++ ++static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) ++{ ++ int cpu, target_cpu; ++ ++ target_cpu = cpumask_first(pd->cpumask.pcpu); ++ for (cpu = 0; cpu < cpu_index; cpu++) ++ target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu); ++ ++ return target_cpu; ++} ++ ++static int padata_cpu_hash(struct parallel_data *pd) ++{ ++ int cpu_index; ++ /* ++ * Hash the sequence numbers to the cpus by taking ++ * seq_nr mod. number of cpus in use. ++ */ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 13, 0) ++ spin_lock(&pd->seq_lock); ++ cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu); ++ pd->seq_nr++; ++ spin_unlock(&pd->seq_lock); ++#else ++ unsigned int seq_nr = atomic_inc_return(&pd->seq_nr); ++ cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu); ++#endif ++ ++ return padata_index_to_cpu(pd, cpu_index); ++} ++ ++static void padata_parallel_worker(struct work_struct *parallel_work) ++{ ++ struct padata_parallel_queue *pqueue; ++ LIST_HEAD(local_list); ++ ++ local_bh_disable(); ++ pqueue = container_of(parallel_work, ++ struct padata_parallel_queue, work); ++ ++ spin_lock(&pqueue->parallel.lock); ++ list_replace_init(&pqueue->parallel.list, &local_list); ++ spin_unlock(&pqueue->parallel.lock); ++ ++ while (!list_empty(&local_list)) { ++ struct padata_priv *padata; ++ ++ padata = list_entry(local_list.next, ++ struct padata_priv, list); ++ ++ list_del_init(&padata->list); ++ ++ padata->parallel(padata); ++ } ++ ++ local_bh_enable(); ++} ++ ++/** ++ * padata_do_parallel - padata parallelization function ++ * ++ * @pinst: padata instance ++ * @padata: object to be parallelized ++ * @cb_cpu: cpu the serialization callback function will run on, ++ * must be in the serial cpumask of padata(i.e. cpumask.cbcpu). ++ * ++ * The parallelization callback function will run with BHs off. ++ * Note: Every object which is parallelized by padata_do_parallel ++ * must be seen by padata_do_serial. 
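++ *
++ * A rough usage sketch (editorial illustration only; "struct my_job",
++ * "my_parallel" and "my_serial" are invented names, not part of this
++ * API): embed a struct padata_priv in the unit of work and set both
++ * callbacks before submitting it:
++ *
++ *	struct my_job {
++ *		struct padata_priv padata;
++ *		...
++ *	};
++ *
++ *	job->padata.parallel = my_parallel;
++ *	job->padata.serial = my_serial;
++ *	err = padata_do_parallel(pinst, &job->padata, cb_cpu);
++ *
++ * The callbacks receive the struct padata_priv pointer and can recover
++ * the enclosing job with container_of(); every object handed to
++ * padata_do_parallel() must later be passed to padata_do_serial().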
++ */
++int padata_do_parallel(struct padata_instance *pinst,
++		       struct padata_priv *padata, int cb_cpu)
++{
++	int target_cpu, err;
++	struct padata_parallel_queue *queue;
++	struct parallel_data *pd;
++
++	rcu_read_lock_bh();
++
++	pd = rcu_dereference_bh(pinst->pd);
++
++	err = -EINVAL;
++	if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
++		goto out;
++
++	if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu))
++		goto out;
++
++	err = -EBUSY;
++	if ((pinst->flags & PADATA_RESET))
++		goto out;
++
++	if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
++		goto out;
++
++	err = 0;
++	atomic_inc(&pd->refcnt);
++	padata->pd = pd;
++	padata->cb_cpu = cb_cpu;
++
++	target_cpu = padata_cpu_hash(pd);
++	queue = per_cpu_ptr(pd->pqueue, target_cpu);
++
++	spin_lock(&queue->parallel.lock);
++	list_add_tail(&padata->list, &queue->parallel.list);
++	spin_unlock(&queue->parallel.lock);
++
++	queue_work_on(target_cpu, pinst->wq, &queue->work);
++
++out:
++	rcu_read_unlock_bh();
++
++	return err;
++}
++
++/*
++ * padata_get_next - Get the next object that needs serialization.
++ *
++ * Return values are:
++ *
++ * A pointer to the control struct of the next object that needs
++ * serialization, if present in one of the percpu reorder queues.
++ *
++ * -EINPROGRESS, if the next object that needs serialization will
++ *  be parallel processed by another cpu and is not yet present in
++ *  the cpu's reorder queue.
++ *
++ * -ENODATA, if this cpu has to do the parallel processing for
++ *  the next object.
++ */
++static struct padata_priv *padata_get_next(struct parallel_data *pd)
++{
++	int cpu, num_cpus;
++	unsigned int next_nr, next_index;
++	struct padata_parallel_queue *next_queue;
++	struct padata_priv *padata;
++	struct padata_list *reorder;
++
++	num_cpus = cpumask_weight(pd->cpumask.pcpu);
++
++	/*
++	 * Calculate the percpu reorder queue and the sequence
++	 * number of the next object.
++	 */
++	next_nr = pd->processed;
++	next_index = next_nr % num_cpus;
++	cpu = padata_index_to_cpu(pd, next_index);
++	next_queue = per_cpu_ptr(pd->pqueue, cpu);
++
++	reorder = &next_queue->reorder;
++
++	spin_lock(&reorder->lock);
++	if (!list_empty(&reorder->list)) {
++		padata = list_entry(reorder->list.next,
++				    struct padata_priv, list);
++
++		list_del_init(&padata->list);
++		atomic_dec(&pd->reorder_objects);
++
++		pd->processed++;
++
++		spin_unlock(&reorder->lock);
++		goto out;
++	}
++	spin_unlock(&reorder->lock);
++
++	if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) {
++		padata = ERR_PTR(-ENODATA);
++		goto out;
++	}
++
++	padata = ERR_PTR(-EINPROGRESS);
++out:
++	return padata;
++}
++
++static void padata_reorder(struct parallel_data *pd)
++{
++	int cb_cpu;
++	struct padata_priv *padata;
++	struct padata_serial_queue *squeue;
++	struct padata_instance *pinst = pd->pinst;
++
++	/*
++	 * We need to ensure that only one cpu can work on dequeueing of
++	 * the reorder queue at a time. Calculating in which percpu reorder
++	 * queue the next object will arrive takes some time. A spinlock
++	 * would be highly contended. Also it is not clear in which order
++	 * the objects arrive to the reorder queues. So a cpu could wait to
++	 * get the lock just to notice that there is nothing to do at the
++	 * moment. Therefore we use a trylock and let the holder of the lock
++	 * care for all the objects enqueued during the holdtime of the lock.
++	 */
++	if (!spin_trylock_bh(&pd->lock))
++		return;
++
++	while (1) {
++		padata = padata_get_next(pd);
++
++		/*
++		 * If the next object that needs serialization is parallel
++		 * processed by another cpu and is still on its way to the
++		 * cpu's reorder queue, nothing to do for now.
++		 */
++		if (PTR_ERR(padata) == -EINPROGRESS)
++			break;
++
++		/*
++		 * This cpu has to do the parallel processing of the next
++		 * object. It's waiting in the cpu's parallelization queue,
++		 * so exit immediately.
++		 */
++		if (PTR_ERR(padata) == -ENODATA) {
++			del_timer(&pd->timer);
++			spin_unlock_bh(&pd->lock);
++			return;
++		}
++
++		cb_cpu = padata->cb_cpu;
++		squeue = per_cpu_ptr(pd->squeue, cb_cpu);
++
++		spin_lock(&squeue->serial.lock);
++		list_add_tail(&padata->list, &squeue->serial.list);
++		spin_unlock(&squeue->serial.lock);
++
++		queue_work_on(cb_cpu, pinst->wq, &squeue->work);
++	}
++
++	spin_unlock_bh(&pd->lock);
++
++	/*
++	 * The next object that needs serialization might have arrived to
++	 * the reorder queues in the meantime, we will be called again
++	 * from the timer function if no one else cares for it.
++	 */
++	if (atomic_read(&pd->reorder_objects)
++			&& !(pinst->flags & PADATA_RESET))
++		mod_timer(&pd->timer, jiffies + HZ);
++	else
++		del_timer(&pd->timer);
++
++	return;
++}
++
++static void padata_reorder_timer(unsigned long arg)
++{
++	struct parallel_data *pd = (struct parallel_data *)arg;
++
++	padata_reorder(pd);
++}
++
++static void padata_serial_worker(struct work_struct *serial_work)
++{
++	struct padata_serial_queue *squeue;
++	struct parallel_data *pd;
++	LIST_HEAD(local_list);
++
++	local_bh_disable();
++	squeue = container_of(serial_work, struct padata_serial_queue, work);
++	pd = squeue->pd;
++
++	spin_lock(&squeue->serial.lock);
++	list_replace_init(&squeue->serial.list, &local_list);
++	spin_unlock(&squeue->serial.lock);
++
++	while (!list_empty(&local_list)) {
++		struct padata_priv *padata;
++
++		padata = list_entry(local_list.next,
++				    struct padata_priv, list);
++
++		list_del_init(&padata->list);
++
++		padata->serial(padata);
++		atomic_dec(&pd->refcnt);
++	}
++	local_bh_enable();
++}
++
++/**
++ * padata_do_serial - padata serialization function
++ *
++ * @padata: object to be serialized.
++ *
++ * padata_do_serial must be called for every parallelized object.
++ * The serialization callback function will run with BHs off.
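++ *
++ * Note (editorial): this may be called from any cpu; the reorder logic
++ * in padata_reorder() above is what restores submission order before
++ * the ->serial() callbacks are invoked.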
++ */
++void padata_do_serial(struct padata_priv *padata)
++{
++	int cpu;
++	struct padata_parallel_queue *pqueue;
++	struct parallel_data *pd;
++
++	pd = padata->pd;
++
++	cpu = get_cpu();
++	pqueue = per_cpu_ptr(pd->pqueue, cpu);
++
++	spin_lock(&pqueue->reorder.lock);
++	atomic_inc(&pd->reorder_objects);
++	list_add_tail(&padata->list, &pqueue->reorder.list);
++	spin_unlock(&pqueue->reorder.lock);
++
++	put_cpu();
++
++	padata_reorder(pd);
++}
++
++static int padata_setup_cpumasks(struct parallel_data *pd,
++				 const struct cpumask *pcpumask,
++				 const struct cpumask *cbcpumask)
++{
++	if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
++		return -ENOMEM;
++
++	cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask);
++	if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
++		free_cpumask_var(pd->cpumask.pcpu);
++		return -ENOMEM;
++	}
++
++	cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask);
++	return 0;
++}
++
++static void __padata_list_init(struct padata_list *pd_list)
++{
++	INIT_LIST_HEAD(&pd_list->list);
++	spin_lock_init(&pd_list->lock);
++}
++
++/* Initialize all percpu queues used by serial workers */
++static void padata_init_squeues(struct parallel_data *pd)
++{
++	int cpu;
++	struct padata_serial_queue *squeue;
++
++	for_each_cpu(cpu, pd->cpumask.cbcpu) {
++		squeue = per_cpu_ptr(pd->squeue, cpu);
++		squeue->pd = pd;
++		__padata_list_init(&squeue->serial);
++		INIT_WORK(&squeue->work, padata_serial_worker);
++	}
++}
++
++/* Initialize all percpu queues used by parallel workers */
++static void padata_init_pqueues(struct parallel_data *pd)
++{
++	int cpu_index, cpu;
++	struct padata_parallel_queue *pqueue;
++
++	cpu_index = 0;
++	for_each_cpu(cpu, pd->cpumask.pcpu) {
++		pqueue = per_cpu_ptr(pd->pqueue, cpu);
++		pqueue->pd = pd;
++		pqueue->cpu_index = cpu_index;
++		cpu_index++;
++
++		__padata_list_init(&pqueue->reorder);
++		__padata_list_init(&pqueue->parallel);
++		INIT_WORK(&pqueue->work, padata_parallel_worker);
++		atomic_set(&pqueue->num_obj, 0);
++	}
++}
++
++/* Allocate and initialize the internal cpumask dependent resources. */
++static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
++					     const struct cpumask *pcpumask,
++					     const struct cpumask *cbcpumask)
++{
++	struct parallel_data *pd;
++
++	pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
++	if (!pd)
++		goto err;
++
++	pd->pqueue = alloc_percpu(struct padata_parallel_queue);
++	if (!pd->pqueue)
++		goto err_free_pd;
++
++	pd->squeue = alloc_percpu(struct padata_serial_queue);
++	if (!pd->squeue)
++		goto err_free_pqueue;
++	if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)
++		goto err_free_squeue;
++
++	padata_init_pqueues(pd);
++	padata_init_squeues(pd);
++	setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 13, 0)
++	pd->seq_nr = 0;
++#else
++	atomic_set(&pd->seq_nr, -1);
++#endif
++	atomic_set(&pd->reorder_objects, 0);
++	atomic_set(&pd->refcnt, 0);
++	pd->pinst = pinst;
++	spin_lock_init(&pd->lock);
++
++	return pd;
++
++err_free_squeue:
++	free_percpu(pd->squeue);
++err_free_pqueue:
++	free_percpu(pd->pqueue);
++err_free_pd:
++	kfree(pd);
++err:
++	return NULL;
++}
++
++static void padata_free_pd(struct parallel_data *pd)
++{
++	free_cpumask_var(pd->cpumask.pcpu);
++	free_cpumask_var(pd->cpumask.cbcpu);
++	free_percpu(pd->pqueue);
++	free_percpu(pd->squeue);
++	kfree(pd);
++}
++
++/* Flush all objects out of the padata queues. */
++static void padata_flush_queues(struct parallel_data *pd)
++{
++	int cpu;
++	struct padata_parallel_queue *pqueue;
++	struct padata_serial_queue *squeue;
++
++	for_each_cpu(cpu, pd->cpumask.pcpu) {
++		pqueue = per_cpu_ptr(pd->pqueue, cpu);
++		flush_work(&pqueue->work);
++	}
++
++	del_timer_sync(&pd->timer);
++
++	if (atomic_read(&pd->reorder_objects))
++		padata_reorder(pd);
++
++	for_each_cpu(cpu, pd->cpumask.cbcpu) {
++		squeue = per_cpu_ptr(pd->squeue, cpu);
++		flush_work(&squeue->work);
++	}
++
++	BUG_ON(atomic_read(&pd->refcnt) != 0);
++}
++
++static void __padata_start(struct padata_instance *pinst)
++{
++	pinst->flags |= PADATA_INIT;
++}
++
++static void __padata_stop(struct padata_instance *pinst)
++{
++	if (!(pinst->flags & PADATA_INIT))
++		return;
++
++	pinst->flags &= ~PADATA_INIT;
++
++	synchronize_rcu();
++
++	get_online_cpus();
++	padata_flush_queues(pinst->pd);
++	put_online_cpus();
++}
++
++/* Replace the internal control structure with a new one. */
++static void padata_replace(struct padata_instance *pinst,
++			   struct parallel_data *pd_new)
++{
++	struct parallel_data *pd_old = pinst->pd;
++	int notification_mask = 0;
++
++	pinst->flags |= PADATA_RESET;
++
++	rcu_assign_pointer(pinst->pd, pd_new);
++
++	synchronize_rcu();
++
++	if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu))
++		notification_mask |= PADATA_CPU_PARALLEL;
++	if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
++		notification_mask |= PADATA_CPU_SERIAL;
++
++	padata_flush_queues(pd_old);
++	padata_free_pd(pd_old);
++
++	if (notification_mask)
++		blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
++					     notification_mask,
++					     &pd_new->cpumask);
++
++	pinst->flags &= ~PADATA_RESET;
++}
++
++/**
++ * padata_register_cpumask_notifier - Registers a notifier that will be called
++ *                                    if either pcpu or cbcpu or both cpumasks change.
++ *
++ * @pinst: A pointer to padata instance
++ * @nblock: A pointer to notifier block.
++ */
++int padata_register_cpumask_notifier(struct padata_instance *pinst,
++				     struct notifier_block *nblock)
++{
++	return blocking_notifier_chain_register(&pinst->cpumask_change_notifier,
++						nblock);
++}
++
++/**
++ * padata_unregister_cpumask_notifier - Unregisters cpumask notifier
++ *                                      registered earlier using padata_register_cpumask_notifier
++ *
++ * @pinst: A pointer to data instance.
++ * @nblock: A pointer to notifier block.
++ */
++int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
++				       struct notifier_block *nblock)
++{
++	return blocking_notifier_chain_unregister(
++		&pinst->cpumask_change_notifier,
++		nblock);
++}
++
++
++/* If cpumask contains no active cpu, we mark the instance as invalid.
*/ ++static bool padata_validate_cpumask(struct padata_instance *pinst, ++ const struct cpumask *cpumask) ++{ ++ if (!cpumask_intersects(cpumask, cpu_online_mask)) { ++ pinst->flags |= PADATA_INVALID; ++ return false; ++ } ++ ++ pinst->flags &= ~PADATA_INVALID; ++ return true; ++} ++ ++static int __padata_set_cpumasks(struct padata_instance *pinst, ++ cpumask_var_t pcpumask, ++ cpumask_var_t cbcpumask) ++{ ++ int valid; ++ struct parallel_data *pd; ++ ++ valid = padata_validate_cpumask(pinst, pcpumask); ++ if (!valid) { ++ __padata_stop(pinst); ++ goto out_replace; ++ } ++ ++ valid = padata_validate_cpumask(pinst, cbcpumask); ++ if (!valid) ++ __padata_stop(pinst); ++ ++out_replace: ++ pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); ++ if (!pd) ++ return -ENOMEM; ++ ++ cpumask_copy(pinst->cpumask.pcpu, pcpumask); ++ cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); ++ ++ padata_replace(pinst, pd); ++ ++ if (valid) ++ __padata_start(pinst); ++ ++ return 0; ++} ++ ++/** ++ * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value ++ * equivalent to @cpumask. ++ * ++ * @pinst: padata instance ++ * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding ++ * to parallel and serial cpumasks respectively. ++ * @cpumask: the cpumask to use ++ */ ++int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, ++ cpumask_var_t cpumask) ++{ ++ struct cpumask *serial_mask, *parallel_mask; ++ int err = -EINVAL; ++ ++ mutex_lock(&pinst->lock); ++ get_online_cpus(); ++ ++ switch (cpumask_type) { ++ case PADATA_CPU_PARALLEL: ++ serial_mask = pinst->cpumask.cbcpu; ++ parallel_mask = cpumask; ++ break; ++ case PADATA_CPU_SERIAL: ++ parallel_mask = pinst->cpumask.pcpu; ++ serial_mask = cpumask; ++ break; ++ default: ++ goto out; ++ } ++ ++ err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask); ++ ++out: ++ put_online_cpus(); ++ mutex_unlock(&pinst->lock); ++ ++ return err; ++} ++ ++/** ++ * padata_start - start the parallel processing ++ * ++ * @pinst: padata instance to start ++ */ ++int padata_start(struct padata_instance *pinst) ++{ ++ int err = 0; ++ ++ mutex_lock(&pinst->lock); ++ ++ if (pinst->flags & PADATA_INVALID) ++ err = -EINVAL; ++ ++ __padata_start(pinst); ++ ++ mutex_unlock(&pinst->lock); ++ ++ return err; ++} ++ ++/** ++ * padata_stop - stop the parallel processing ++ * ++ * @pinst: padata instance to stop ++ */ ++void padata_stop(struct padata_instance *pinst) ++{ ++ mutex_lock(&pinst->lock); ++ __padata_stop(pinst); ++ mutex_unlock(&pinst->lock); ++} ++ ++static void __padata_free(struct padata_instance *pinst) ++{ ++ padata_stop(pinst); ++ padata_free_pd(pinst->pd); ++ free_cpumask_var(pinst->cpumask.pcpu); ++ free_cpumask_var(pinst->cpumask.cbcpu); ++ kfree(pinst); ++} ++ ++#define kobj2pinst(_kobj) \ ++ container_of(_kobj, struct padata_instance, kobj) ++#define attr2pentry(_attr) \ ++ container_of(_attr, struct padata_sysfs_entry, attr) ++ ++static void padata_sysfs_release(struct kobject *kobj) ++{ ++ struct padata_instance *pinst = kobj2pinst(kobj); ++ __padata_free(pinst); ++} ++ ++struct padata_sysfs_entry { ++ struct attribute attr; ++ ssize_t (*show)(struct padata_instance *, struct attribute *, char *); ++ ssize_t (*store)(struct padata_instance *, struct attribute *, ++ const char *, size_t); ++}; ++ ++static ssize_t show_cpumask(struct padata_instance *pinst, ++ struct attribute *attr, char *buf) ++{ ++ struct cpumask *cpumask; ++ ssize_t len; ++ ++ mutex_lock(&pinst->lock); ++ if (!strcmp(attr->name, "serial_cpumask")) ++ 
cpumask = pinst->cpumask.cbcpu; ++ else ++ cpumask = pinst->cpumask.pcpu; ++ ++ len = snprintf(buf, PAGE_SIZE, "%*pb\n", ++ nr_cpu_ids, cpumask_bits(cpumask)); ++ mutex_unlock(&pinst->lock); ++ return len < PAGE_SIZE ? len : -EINVAL; ++} ++ ++static ssize_t store_cpumask(struct padata_instance *pinst, ++ struct attribute *attr, ++ const char *buf, size_t count) ++{ ++ cpumask_var_t new_cpumask; ++ ssize_t ret; ++ int mask_type; ++ ++ if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask), ++ nr_cpumask_bits); ++ if (ret < 0) ++ goto out; ++ ++ mask_type = !strcmp(attr->name, "serial_cpumask") ? ++ PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL; ++ ret = padata_set_cpumask(pinst, mask_type, new_cpumask); ++ if (!ret) ++ ret = count; ++ ++out: ++ free_cpumask_var(new_cpumask); ++ return ret; ++} ++ ++#define PADATA_ATTR_RW(_name, _show_name, _store_name) \ ++ static struct padata_sysfs_entry _name##_attr = \ ++ __ATTR(_name, 0644, _show_name, _store_name) ++#define PADATA_ATTR_RO(_name, _show_name) \ ++ static struct padata_sysfs_entry _name##_attr = \ ++ __ATTR(_name, 0400, _show_name, NULL) ++ ++PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask); ++PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask); ++ ++/* ++ * Padata sysfs provides the following objects: ++ * serial_cpumask [RW] - cpumask for serial workers ++ * parallel_cpumask [RW] - cpumask for parallel workers ++ */ ++static struct attribute *padata_default_attrs[] = { ++ &serial_cpumask_attr.attr, ++ ¶llel_cpumask_attr.attr, ++ NULL, ++}; ++ ++static ssize_t padata_sysfs_show(struct kobject *kobj, ++ struct attribute *attr, char *buf) ++{ ++ struct padata_instance *pinst; ++ struct padata_sysfs_entry *pentry; ++ ssize_t ret = -EIO; ++ ++ pinst = kobj2pinst(kobj); ++ pentry = attr2pentry(attr); ++ if (pentry->show) ++ ret = pentry->show(pinst, attr, buf); ++ ++ return ret; ++} ++ ++static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct padata_instance *pinst; ++ struct padata_sysfs_entry *pentry; ++ ssize_t ret = -EIO; ++ ++ pinst = kobj2pinst(kobj); ++ pentry = attr2pentry(attr); ++ if (pentry->show) ++ ret = pentry->store(pinst, attr, buf, count); ++ ++ return ret; ++} ++ ++static const struct sysfs_ops padata_sysfs_ops = { ++ .show = padata_sysfs_show, ++ .store = padata_sysfs_store, ++}; ++ ++static struct kobj_type padata_attr_type = { ++ .sysfs_ops = &padata_sysfs_ops, ++ .default_attrs = padata_default_attrs, ++ .release = padata_sysfs_release, ++}; ++ ++/** ++ * padata_alloc_possible - Allocate and initialize padata instance. ++ * Use the cpu_possible_mask for serial and ++ * parallel workers. ++ * ++ * @wq: workqueue to use for the allocated padata instance ++ */ ++struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) ++{ ++ return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); ++} ++ ++/** ++ * padata_alloc - allocate and initialize a padata instance and specify ++ * cpumasks for serial and parallel workers. 
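++ *
++ * A typical lifecycle, sketched for illustration (editorial; "wq" is
++ * assumed to be a workqueue the caller created beforehand):
++ *
++ *	pinst = padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
++ *	err = padata_start(pinst);
++ *	... submit work via padata_do_parallel() ...
++ *	padata_stop(pinst);
++ *	padata_free(pinst);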
++/**
++ * padata_alloc - allocate and initialize a padata instance and specify
++ *                cpumasks for serial and parallel workers.
++ *
++ * @wq: workqueue to use for the allocated padata instance
++ * @pcpumask: cpumask that will be used for padata parallelization
++ * @cbcpumask: cpumask that will be used for padata serialization
++ */
++struct padata_instance *padata_alloc(struct workqueue_struct *wq,
++				     const struct cpumask *pcpumask,
++				     const struct cpumask *cbcpumask)
++{
++	struct padata_instance *pinst;
++	struct parallel_data *pd = NULL;
++
++	pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
++	if (!pinst)
++		goto err;
++
++	get_online_cpus();
++	if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
++		goto err_free_inst;
++	if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
++		free_cpumask_var(pinst->cpumask.pcpu);
++		goto err_free_inst;
++	}
++	if (!padata_validate_cpumask(pinst, pcpumask) ||
++	    !padata_validate_cpumask(pinst, cbcpumask))
++		goto err_free_masks;
++
++	pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
++	if (!pd)
++		goto err_free_masks;
++
++	rcu_assign_pointer(pinst->pd, pd);
++
++	pinst->wq = wq;
++
++	cpumask_copy(pinst->cpumask.pcpu, pcpumask);
++	cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
++
++	pinst->flags = 0;
++
++	put_online_cpus();
++
++	BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
++	kobject_init(&pinst->kobj, &padata_attr_type);
++	mutex_init(&pinst->lock);
++
++	return pinst;
++
++err_free_masks:
++	free_cpumask_var(pinst->cpumask.pcpu);
++	free_cpumask_var(pinst->cpumask.cbcpu);
++err_free_inst:
++	kfree(pinst);
++	put_online_cpus();
++err:
++	return NULL;
++}
++
++/**
++ * padata_free - free a padata instance
++ *
++ * @pinst: padata instance to free
++ */
++void padata_free(struct padata_instance *pinst)
++{
++	kobject_put(&pinst->kobj);
++}
+--- /dev/null	2017-07-05 16:27:37.615351856 +0200
++++ b/net/wireguard/compat/checksum/checksum_partial_compat.h	2017-07-06 18:17:33.000000000 +0200
+@@ -0,0 +1,201 @@
++#include <net/route.h>
++#include <net/esp.h>
++#include <net/ip.h>
++#include <net/ipv6.h>
++#define IP6_MF		0x0001
++#define IP6_OFFSET	0xFFF8
++static inline int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, unsigned int max)
++{
++	if (skb_headlen(skb) >= len)
++		return 0;
++	if (max > skb->len)
++		max = skb->len;
++	if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL)
++		return -ENOMEM;
++	if (skb_headlen(skb) < len)
++		return -EPROTO;
++	return 0;
++}
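++/* The helpers below backport skb_checksum_setup() for older kernels:
++ * they pull the network and transport headers into the linear area,
++ * point the skb's partial-checksum state at the TCP/UDP check field via
++ * skb_partial_csum_set(), and, when recalculate is true, seed that field
++ * with the pseudo-header checksum. */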
++#define MAX_IP_HDR_LEN 128
++static inline int skb_checksum_setup_ip(struct sk_buff *skb, bool recalculate)
++{
++	unsigned int off;
++	bool fragment;
++	int err;
++	fragment = false;
++	err = skb_maybe_pull_tail(skb, sizeof(struct iphdr), MAX_IP_HDR_LEN);
++	if (err < 0)
++		goto out;
++	if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF))
++		fragment = true;
++	off = ip_hdrlen(skb);
++	err = -EPROTO;
++	if (fragment)
++		goto out;
++	switch (ip_hdr(skb)->protocol) {
++	case IPPROTO_TCP:
++		err = skb_maybe_pull_tail(skb,
++					  off + sizeof(struct tcphdr),
++					  MAX_IP_HDR_LEN);
++		if (err < 0)
++			goto out;
++
++		if (!skb_partial_csum_set(skb, off,
++					  offsetof(struct tcphdr, check))) {
++			err = -EPROTO;
++			goto out;
++		}
++
++		if (recalculate)
++			tcp_hdr(skb)->check =
++				~csum_tcpudp_magic(ip_hdr(skb)->saddr,
++						   ip_hdr(skb)->daddr,
++						   skb->len - off,
++						   IPPROTO_TCP, 0);
++		break;
++	case IPPROTO_UDP:
++		err = skb_maybe_pull_tail(skb,
++					  off + sizeof(struct udphdr),
++					  MAX_IP_HDR_LEN);
++		if (err < 0)
++			goto out;
++
++		if (!skb_partial_csum_set(skb, off,
++					  offsetof(struct udphdr, check))) {
++			err = -EPROTO;
++			goto out;
++		}
++
++		if (recalculate)
++			udp_hdr(skb)->check =
++				~csum_tcpudp_magic(ip_hdr(skb)->saddr,
++						   ip_hdr(skb)->daddr,
++						   skb->len - off,
++						   IPPROTO_UDP, 0);
++		break;
++	default:
++		goto out;
++	}
++	err = 0;
++out:
++	return err;
++}
++#define MAX_IPV6_HDR_LEN 256
++#define OPT_HDR(type, skb, off) \
++	(type *)(skb_network_header(skb) + (off))
++static inline int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
++{
++	int err;
++	u8 nexthdr;
++	unsigned int off;
++	unsigned int len;
++	bool fragment;
++	bool done;
++	fragment = false;
++	done = false;
++	off = sizeof(struct ipv6hdr);
++	err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN);
++	if (err < 0)
++		goto out;
++	nexthdr = ipv6_hdr(skb)->nexthdr;
++	len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
++	while (off <= len && !done) {
++		switch (nexthdr) {
++		case IPPROTO_DSTOPTS:
++		case IPPROTO_HOPOPTS:
++		case IPPROTO_ROUTING: {
++			struct ipv6_opt_hdr *hp;
++
++			err = skb_maybe_pull_tail(skb, off + sizeof(struct ipv6_opt_hdr), MAX_IPV6_HDR_LEN);
++			if (err < 0)
++				goto out;
++			hp = OPT_HDR(struct ipv6_opt_hdr, skb, off);
++			nexthdr = hp->nexthdr;
++			off += ipv6_optlen(hp);
++			break;
++		}
++		case IPPROTO_FRAGMENT: {
++			struct frag_hdr *hp;
++			err = skb_maybe_pull_tail(skb, off + sizeof(struct frag_hdr), MAX_IPV6_HDR_LEN);
++			if (err < 0)
++				goto out;
++			hp = OPT_HDR(struct frag_hdr, skb, off);
++			if (hp->frag_off & htons(IP6_OFFSET | IP6_MF))
++				fragment = true;
++			nexthdr = hp->nexthdr;
++			off += sizeof(struct frag_hdr);
++			break;
++		}
++		default:
++			done = true;
++			break;
++		}
++	}
++	err = -EPROTO;
++	if (!done || fragment)
++		goto out;
++	switch (nexthdr) {
++	case IPPROTO_TCP:
++		err = skb_maybe_pull_tail(skb,
++					  off + sizeof(struct tcphdr),
++					  MAX_IPV6_HDR_LEN);
++		if (err < 0)
++			goto out;
++
++		if (!skb_partial_csum_set(skb, off,
++					  offsetof(struct tcphdr, check))) {
++			err = -EPROTO;
++			goto out;
++		}
++
++		if (recalculate)
++			tcp_hdr(skb)->check =
++				~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
++						 &ipv6_hdr(skb)->daddr,
++						 skb->len - off,
++						 IPPROTO_TCP, 0);
++		break;
++	case IPPROTO_UDP:
++		err = skb_maybe_pull_tail(skb,
++					  off + sizeof(struct udphdr),
++					  MAX_IPV6_HDR_LEN);
++		if (err < 0)
++			goto out;
++
++		if (!skb_partial_csum_set(skb, off,
++					  offsetof(struct udphdr, check))) {
++			err = -EPROTO;
++			goto out;
++		}
++
++		if (recalculate)
++			udp_hdr(skb)->check =
++				~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
++						 &ipv6_hdr(skb)->daddr,
++						 skb->len - off,
++						 IPPROTO_UDP, 0);
++		break;
++	default:
++		goto out;
++	}
++	err = 0;
++out:
++	return err;
++}
++static inline int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
++{
++	int err;
++	switch (skb->protocol) {
++	case htons(ETH_P_IP):
++		err = skb_checksum_setup_ip(skb, recalculate);
++		break;
++
++	case htons(ETH_P_IPV6):
++		err = skb_checksum_setup_ipv6(skb, recalculate);
++		break;
++	default:
++		err = -EPROTO;
++		break;
++	}
++	return err;
++}
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -85,2 +85,3 @@ config INET