Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'md/4.9-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

Pull MD updates from Shaohua Li:
"This update includes:

- a new AVX512 instruction based raid6 gen/recovery algorithm (a portable
  reference sketch of the underlying P/Q math follows this message)

- a couple of md-cluster related bug fixes

- fix a potential deadlock

- set nonrotational bit for raid array with SSD

- set correct max_hw_sectors for raid5/6, which hopefully can improve
performance a little bit

- other minor fixes"
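
For context on the largest item: the AVX512 code vectorizes the same GF(2^8) arithmetic that the portable lib/raid6 versions compute one byte at a time. A minimal reference sketch of that P/Q math (gf2_mul2 and gen_syndrome_ref are illustrative names, not kernel symbols; the pointer convention mirrors the kernel's, with P and Q as the last two pages):

#include <stddef.h>
#include <stdint.h>

/* Multiply a GF(2^8) element by x (i.e. by 2) modulo the RAID-6
 * polynomial x^8 + x^4 + x^3 + x^2 + 1; 0x1d is the low byte of that
 * polynomial, the same constant the AVX512 code broadcasts. */
static uint8_t gf2_mul2(uint8_t v)
{
    return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

/* P is the XOR of all data disks; Q is the Reed-Solomon syndrome
 * sum(g^z * D_z), evaluated by Horner's rule from the highest data
 * disk down, which is exactly the loop structure of gen_syndrome. */
static void gen_syndrome_ref(int disks, size_t bytes, uint8_t **ptrs)
{
    int z0 = disks - 3;           /* highest data disk */
    uint8_t *p = ptrs[z0 + 1];    /* XOR parity */
    uint8_t *q = ptrs[z0 + 2];    /* RS syndrome */
    size_t d;
    int z;

    for (d = 0; d < bytes; d++) {
        uint8_t wp = ptrs[z0][d];
        uint8_t wq = wp;

        for (z = z0 - 1; z >= 0; z--) {
            wq = gf2_mul2(wq) ^ ptrs[z][d];   /* Q: multiply, add next disk */
            wp ^= ptrs[z][d];                 /* P: plain XOR */
        }
        p[d] = wp;
        q[d] = wq;
    }
}

The vpcmpgtb/vpaddb/vpandq/vpxorq sequences in the new avx512.c below apply this same multiply-by-2 step to 64 bytes per register per iteration.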

* tag 'md/4.9-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
md: set rotational bit
raid6/test/test.c: bug fix: specify aligned() attributes on the char arrays
raid5: handle register_shrinker failure
raid5: fix to detect failure of register_shrinker
md: fix a potential deadlock
md/bitmap: fix wrong cleanup
raid5: allow arbitrary max_hw_sectors
lib/raid6: Add AVX512 optimized xor_syndrome functions
lib/raid6/test/Makefile: Add avx512 gen_syndrome and recovery functions
lib/raid6: Add AVX512 optimized recovery functions
lib/raid6: Add AVX512 optimized gen_syndrome functions
md-cluster: make resync lock also interruptible
md-cluster: introduce dlm_lock_sync_interruptible to fix tasks hang
md-cluster: convert the completion to wait queue
md-cluster: protect md_find_rdev_nr_rcu with rcu lock
md-cluster: clean related infos of cluster
md: changes for MD_STILL_CLOSED flag
md-cluster: remove some unnecessary dlm_unlock_sync
md-cluster: use FORCEUNLOCK in lockres_free
md-cluster: call md_kick_rdev_from_array once ack failed

+1111 -54
+3 -2
arch/x86/Makefile
···
 asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
 avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
+avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
 sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
 sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
 
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
+1 -3
drivers/md/bitmap.c
···
     struct bitmap_counts *counts;
     struct bitmap *bitmap = bitmap_create(mddev, slot);
 
-    if (IS_ERR(bitmap)) {
-        bitmap_free(bitmap);
+    if (IS_ERR(bitmap))
         return PTR_ERR(bitmap);
-    }
 
     rv = bitmap_init_from_disk(bitmap, 0);
     if (rv)
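
The bitmap.c fix removes a bitmap_free() call on an error pointer. As a reminder of the convention involved, a hedged userspace sketch of kernel-style error pointers (the constant and helpers mirror include/linux/err.h but are redefined here purely for illustration): errno values live in the last page of the address space, so an IS_ERR() pointer is not a freeable object.

#include <stdio.h>

#define MAX_ERRNO 4095  /* as in the kernel's err.h */

static void *ERR_PTR(long err) { return (void *)err; }
static long PTR_ERR(const void *p) { return (long)p; }
static int IS_ERR(const void *p)
{
    return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
    void *bitmap = ERR_PTR(-12);  /* as if bitmap_create() returned -ENOMEM */

    if (IS_ERR(bitmap)) {
        /* freeing here would treat -ENOMEM as an address: the removed bug */
        printf("create failed: %ld\n", PTR_ERR(bitmap));
        return 1;
    }
    return 0;
}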
+69 -30
drivers/md/md-cluster.c
···
 
 
 #include <linux/module.h>
+#include <linux/kthread.h>
 #include <linux/dlm.h>
 #include <linux/sched.h>
 #include <linux/raid/md_p.h>
···
     struct dlm_lksb lksb;
     char *name; /* lock name. */
     uint32_t flags; /* flags to pass to dlm_lock() */
-    struct completion completion; /* completion for synchronized locking */
+    wait_queue_head_t sync_locking; /* wait queue for synchronized locking */
+    bool sync_locking_done;
     void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
     struct mddev *mddev; /* pointing back to mddev. */
     int mode;
···
     struct dlm_lock_resource *res;
 
     res = arg;
-    complete(&res->completion);
+    res->sync_locking_done = true;
+    wake_up(&res->sync_locking);
 }
 
 static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
···
             0, sync_ast, res, res->bast);
     if (ret)
         return ret;
-    wait_for_completion(&res->completion);
+    wait_event(res->sync_locking, res->sync_locking_done);
+    res->sync_locking_done = false;
     if (res->lksb.sb_status == 0)
         res->mode = mode;
     return res->lksb.sb_status;
···
 static int dlm_unlock_sync(struct dlm_lock_resource *res)
 {
     return dlm_lock_sync(res, DLM_LOCK_NL);
+}
+
+/*
+ * An variation of dlm_lock_sync, which make lock request could
+ * be interrupted
+ */
+static int dlm_lock_sync_interruptible(struct dlm_lock_resource *res, int mode,
+                                       struct mddev *mddev)
+{
+    int ret = 0;
+
+    ret = dlm_lock(res->ls, mode, &res->lksb,
+            res->flags, res->name, strlen(res->name),
+            0, sync_ast, res, res->bast);
+    if (ret)
+        return ret;
+
+    wait_event(res->sync_locking, res->sync_locking_done
+                    || kthread_should_stop()
+                    || test_bit(MD_CLOSING, &mddev->flags));
+    if (!res->sync_locking_done) {
+        /*
+         * the convert queue contains the lock request when request is
+         * interrupted, and sync_ast could still be run, so need to
+         * cancel the request and reset completion
+         */
+        ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_CANCEL,
+            &res->lksb, res);
+        res->sync_locking_done = false;
+        if (unlikely(ret != 0))
+            pr_info("failed to cancel previous lock request "
+                "%s return %d\n", res->name, ret);
+        return -EPERM;
+    } else
+        res->sync_locking_done = false;
+    if (res->lksb.sb_status == 0)
+        res->mode = mode;
+    return res->lksb.sb_status;
 }
 
 static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
···
     res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
     if (!res)
         return NULL;
-    init_completion(&res->completion);
+    init_waitqueue_head(&res->sync_locking);
+    res->sync_locking_done = false;
     res->ls = cinfo->lockspace;
     res->mddev = mddev;
     res->mode = DLM_LOCK_IV;
···
 
 static void lockres_free(struct dlm_lock_resource *res)
 {
-    int ret;
+    int ret = 0;
 
     if (!res)
         return;
 
-    /* cancel a lock request or a conversion request that is blocked */
-    res->flags |= DLM_LKF_CANCEL;
-retry:
-    ret = dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
-    if (unlikely(ret != 0)) {
-        pr_info("%s: failed to unlock %s return %d\n", __func__, res->name, ret);
-
-        /* if a lock conversion is cancelled, then the lock is put
-         * back to grant queue, need to ensure it is unlocked */
-        if (ret == -DLM_ECANCEL)
-            goto retry;
-    }
-    res->flags &= ~DLM_LKF_CANCEL;
-    wait_for_completion(&res->completion);
+    /*
+     * use FORCEUNLOCK flag, so we can unlock even the lock is on the
+     * waiting or convert queue
+     */
+    ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_FORCEUNLOCK,
+        &res->lksb, res);
+    if (unlikely(ret != 0))
+        pr_err("failed to unlock %s return %d\n", res->name, ret);
+    else
+        wait_event(res->sync_locking, res->sync_locking_done);
 
     kfree(res->name);
     kfree(res->lksb.sb_lvbptr);
···
         goto clear_bit;
     }
 
-    ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
+    ret = dlm_lock_sync_interruptible(bm_lockres, DLM_LOCK_PW, mddev);
     if (ret) {
         pr_err("md-cluster: Could not DLM lock %s: %d\n",
                 str, ret);
···
     ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
     if (ret) {
         pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
-        goto dlm_unlock;
+        goto clear_bit;
     }
     if (hi > 0) {
         if (lo < mddev->recovery_cp)
···
             md_wakeup_thread(mddev->thread);
         }
     }
-dlm_unlock:
-    dlm_unlock_sync(bm_lockres);
 clear_bit:
     lockres_free(bm_lockres);
     clear_bit(slot, &cinfo->recovery_map);
···
 
 static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
 {
-    struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
-                                               le32_to_cpu(msg->raid_slot));
+    struct md_rdev *rdev;
 
+    rcu_read_lock();
+    rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot));
     if (rdev) {
         set_bit(ClusterRemove, &rdev->flags);
         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
···
     else
         pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
             __func__, __LINE__, le32_to_cpu(msg->raid_slot));
+    rcu_read_unlock();
 }
 
 static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
 {
-    struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
-                                               le32_to_cpu(msg->raid_slot));
+    struct md_rdev *rdev;
 
+    rcu_read_lock();
+    rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot));
     if (rdev && test_bit(Faulty, &rdev->flags))
         clear_bit(Faulty, &rdev->flags);
     else
         pr_warn("%s: %d Could not find disk(%d) which is faulty",
             __func__, __LINE__, le32_to_cpu(msg->raid_slot));
+    rcu_read_unlock();
 }
 
 static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
···
             md_check_recovery(mddev);
     }
 
-    dlm_unlock_sync(bm_lockres);
     lockres_free(bm_lockres);
 }
 out:
···
 static int resync_start(struct mddev *mddev)
 {
     struct md_cluster_info *cinfo = mddev->cluster_info;
-    return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX);
+    return dlm_lock_sync_interruptible(cinfo->resync_lockres, DLM_LOCK_EX, mddev);
 }
 
 static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
···
     if (cinfo->other_bitmap_lockres) {
         for (i = 0; i < mddev->bitmap_info.nodes - 1; i++) {
             if (cinfo->other_bitmap_lockres[i]) {
-                dlm_unlock_sync(cinfo->other_bitmap_lockres[i]);
                 lockres_free(cinfo->other_bitmap_lockres[i]);
             }
         }
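
The heart of the md-cluster changes is swapping a one-shot completion for a wait queue plus a flag, so a waiter can also wake on kthread_should_stop() or MD_CLOSING. A hedged userspace analogue with pthreads (all names here are invented for illustration; initialize the fields with PTHREAD_MUTEX_INITIALIZER / PTHREAD_COND_INITIALIZER):

#include <pthread.h>
#include <stdbool.h>

struct sync_lock {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    bool done;          /* plays the role of sync_locking_done */
    bool interrupted;   /* stands in for "should stop" / "closing" */
};

/* sync_ast analogue: mark done and wake all waiters */
static void complete_sync(struct sync_lock *s)
{
    pthread_mutex_lock(&s->lock);
    s->done = true;
    pthread_cond_broadcast(&s->cond);
    pthread_mutex_unlock(&s->lock);
}

/* dlm_lock_sync_interruptible analogue: wait for done OR interruption */
static int wait_sync_interruptible(struct sync_lock *s)
{
    int ret;

    pthread_mutex_lock(&s->lock);
    while (!s->done && !s->interrupted)
        pthread_cond_wait(&s->cond, &s->lock);
    ret = s->done ? 0 : -1;   /* -1 mirrors the kernel's -EPERM path */
    s->done = false;          /* re-arm, like resetting sync_locking_done */
    pthread_mutex_unlock(&s->lock);
    return ret;
}

A completion cannot express the extra wake-up conditions; the flag-under-lock form can, which is why the patch series converts the completion first and then builds the interruptible variant on top of it.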
+35 -9
drivers/md/md.c
···
         return err;
     }
     if (mddev->queue) {
+        bool nonrot = true;
+
+        rdev_for_each(rdev, mddev) {
+            if (rdev->raid_disk >= 0 &&
+                !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
+                nonrot = false;
+                break;
+            }
+        }
+        if (mddev->degraded)
+            nonrot = false;
+        if (nonrot)
+            queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mddev->queue);
+        else
+            queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, mddev->queue);
         mddev->queue->backing_dev_info.congested_data = mddev;
         mddev->queue->backing_dev_info.congested_fn = md_congested;
     }
···
     mddev->degraded = 0;
     mddev->safemode = 0;
     mddev->private = NULL;
+    mddev->cluster_info = NULL;
     mddev->bitmap_info.offset = 0;
     mddev->bitmap_info.default_offset = 0;
     mddev->bitmap_info.default_space = 0;
     mddev->bitmap_info.chunksize = 0;
     mddev->bitmap_info.daemon_sleep = 0;
     mddev->bitmap_info.max_write_behind = 0;
+    mddev->bitmap_info.nodes = 0;
 }
 
 static void __md_stop_writes(struct mddev *mddev)
···
     mutex_lock(&mddev->open_mutex);
     if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
         mddev->sync_thread ||
-        test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
-        (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
+        test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
         printk("md: %s still in use.\n",mdname(mddev));
         if (did_freeze) {
             clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
···
     if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
         mddev->sysfs_active ||
         mddev->sync_thread ||
-        test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
-        (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
+        test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
         printk("md: %s still in use.\n",mdname(mddev));
         mutex_unlock(&mddev->open_mutex);
         if (did_freeze) {
···
         export_rdev(rdev);
 
     if (mddev_is_clustered(mddev)) {
-        if (info->state & (1 << MD_DISK_CANDIDATE))
-            md_cluster_ops->new_disk_ack(mddev, (err == 0));
-        else {
+        if (info->state & (1 << MD_DISK_CANDIDATE)) {
+            if (!err) {
+                err = md_cluster_ops->new_disk_ack(mddev,
+                        err == 0);
+                if (err)
+                    md_kick_rdev_from_array(rdev);
+            }
+        } else {
             if (err)
                 md_cluster_ops->add_new_disk_cancel(mddev);
             else
···
             err = -EBUSY;
             goto out;
         }
-        set_bit(MD_STILL_CLOSED, &mddev->flags);
+        set_bit(MD_CLOSING, &mddev->flags);
         mutex_unlock(&mddev->open_mutex);
         sync_blockdev(bdev);
     }
···
     if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
         goto out;
 
+    if (test_bit(MD_CLOSING, &mddev->flags)) {
+        mutex_unlock(&mddev->open_mutex);
+        return -ENODEV;
+    }
+
     err = 0;
     atomic_inc(&mddev->openers);
-    clear_bit(MD_STILL_CLOSED, &mddev->flags);
     mutex_unlock(&mddev->open_mutex);
 
     check_disk_change(bdev);
···
         list_del(&node_detected_dev->list);
         dev = node_detected_dev->dev;
         kfree(node_detected_dev);
+        mutex_unlock(&detected_devices_mutex);
         rdev = md_import_device(dev,0, 90);
+        mutex_lock(&detected_devices_mutex);
         if (IS_ERR(rdev))
             continue;
 
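
The first md.c hunk implements the "set nonrotational bit" item: the array queue is marked NONROT only if every active member is non-rotational and the array is not degraded. The same policy can be sanity-checked from userspace against the sysfs attribute that blk_queue_nonrot() reflects; a hedged sketch (the member disk names are hypothetical):

#include <stdbool.h>
#include <stdio.h>

/* Read /sys/block/<name>/queue/rotational; 0 means non-rotational (SSD). */
static bool disk_is_nonrot(const char *name)
{
    char path[256];
    FILE *f;
    int rot = 1;   /* unknown: assume rotational */

    snprintf(path, sizeof(path), "/sys/block/%s/queue/rotational", name);
    f = fopen(path, "r");
    if (!f)
        return false;
    if (fscanf(f, "%d", &rot) != 1)
        rot = 1;
    fclose(f);
    return rot == 0;
}

int main(void)
{
    const char *members[] = { "sda", "sdb" };   /* hypothetical members */
    bool nonrot = true;
    unsigned i;

    /* same AND-over-members logic as the kernel hunk */
    for (i = 0; i < sizeof(members) / sizeof(members[0]); i++)
        if (!disk_is_nonrot(members[i]))
            nonrot = false;
    printf("array nonrot: %d\n", nonrot);
    return 0;
}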
+2 -3
drivers/md/md.h
···
 #define MD_CHANGE_PENDING 2    /* switch from 'clean' to 'active' in progress */
 #define MD_UPDATE_SB_FLAGS (1 | 2 | 4)    /* If these are set, md_update_sb needed */
 #define MD_ARRAY_FIRST_USE 3    /* First use of array, needs initialization */
-#define MD_STILL_CLOSED    4    /* If set, then array has not been opened since
-                 * md_ioctl checked on it.
-                 */
+#define MD_CLOSING    4    /* If set, we are closing the array, do not open
+                 * it then */
 #define MD_JOURNAL_CLEAN 5    /* A raid with journal is already clean */
 #define MD_HAS_JOURNAL    6    /* The raid array has journal feature set */
 #define MD_RELOAD_SB    7    /* Reload the superblock because another node
+9 -2
drivers/md/raid5.c
···
 {
     if (conf->log)
         r5l_exit_log(conf->log);
-    if (conf->shrinker.seeks)
+    if (conf->shrinker.nr_deferred)
         unregister_shrinker(&conf->shrinker);
 
     free_thread_groups(conf);
···
     conf->shrinker.count_objects = raid5_cache_count;
     conf->shrinker.batch = 128;
     conf->shrinker.flags = 0;
-    register_shrinker(&conf->shrinker);
+    if (register_shrinker(&conf->shrinker)) {
+        printk(KERN_ERR
+               "md/raid:%s: couldn't register shrinker.\n",
+               mdname(mddev));
+        goto abort;
+    }
 
     sprintf(pers_name, "raid%d", mddev->new_level);
     conf->thread = md_register_thread(raid5d, mddev, pers_name);
···
         else
             queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
                         mddev->queue);
+
+        blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
     }
 
     if (journal_dev) {
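
Both raid5 hunks follow from one fact: register_shrinker() allocates the shrinker's internal nr_deferred array and can therefore fail, so the old teardown check of shrinker.seeks (a field set before registration ever runs) could unregister a shrinker that was never registered. A hedged sketch of the resulting pattern, with invented names:

#include <errno.h>
#include <stdlib.h>

struct fake_shrinker {
    unsigned long *nr_deferred;   /* NULL until registration succeeds */
};

static int fake_register_shrinker(struct fake_shrinker *s)
{
    /* registration allocates state, so it can fail */
    s->nr_deferred = calloc(64, sizeof(*s->nr_deferred));
    return s->nr_deferred ? 0 : -ENOMEM;
}

static void fake_free_conf(struct fake_shrinker *s)
{
    if (s->nr_deferred) {   /* only unregister what was registered */
        free(s->nr_deferred);
        s->nr_deferred = NULL;
    }
}

The allocated pointer doubles as a reliable "was registered" marker, which is exactly why the teardown test moves from shrinker.seeks to shrinker.nr_deferred.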
+4
include/linux/raid/pq.h
···
 extern const struct raid6_calls raid6_avx2x1;
 extern const struct raid6_calls raid6_avx2x2;
 extern const struct raid6_calls raid6_avx2x4;
+extern const struct raid6_calls raid6_avx512x1;
+extern const struct raid6_calls raid6_avx512x2;
+extern const struct raid6_calls raid6_avx512x4;
 extern const struct raid6_calls raid6_tilegx8;
 extern const struct raid6_calls raid6_s390vx8;
···
 extern const struct raid6_recov_calls raid6_recov_intx1;
 extern const struct raid6_recov_calls raid6_recov_ssse3;
 extern const struct raid6_recov_calls raid6_recov_avx2;
+extern const struct raid6_recov_calls raid6_recov_avx512;
 extern const struct raid6_recov_calls raid6_recov_s390xc;
 
 extern const struct raid6_calls raid6_neonx1;
+1 -1
lib/raid6/Makefile
···
 raid6_pq-y    += algos.o recov.o tables.o int1.o int2.o int4.o \
            int8.o int16.o int32.o
 
-raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
+raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
+12
lib/raid6/algos.c
···
         &raid6_avx2x1,
         &raid6_avx2x2,
 #endif
+#ifdef CONFIG_AS_AVX512
+        &raid6_avx512x1,
+        &raid6_avx512x2,
+#endif
 #endif
 #if defined(__x86_64__) && !defined(__arch_um__)
         &raid6_sse2x1,
···
         &raid6_avx2x1,
         &raid6_avx2x2,
         &raid6_avx2x4,
+#endif
+#ifdef CONFIG_AS_AVX512
+        &raid6_avx512x1,
+        &raid6_avx512x2,
+        &raid6_avx512x4,
 #endif
 #endif
 #ifdef CONFIG_ALTIVEC
···
 EXPORT_SYMBOL_GPL(raid6_datap_recov);
 
 const struct raid6_recov_calls *const raid6_recov_algos[] = {
+#ifdef CONFIG_AS_AVX512
+    &raid6_recov_avx512,
+#endif
#ifdef CONFIG_AS_AVX2
     &raid6_recov_avx2,
 #endif
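
For orientation, these tables are not called directly: at init time lib/raid6 walks the candidate list, skips entries whose ->valid() rejects the CPU (raid6_have_avx512() tests the AVX512F/BW/VL/DQ feature bits), benchmarks the rest, and keeps the fastest. A much-simplified userspace sketch of that selection loop (types, names, and the clock()-based timing are illustrative, not the kernel's):

#include <time.h>

struct candidate {
    const char *name;
    int (*valid)(void);   /* does this CPU support the instructions? */
    void (*run)(void);    /* one syndrome pass over a scratch buffer */
};

static const struct candidate *pick_fastest(const struct candidate **tbl)
{
    const struct candidate *best = NULL;
    clock_t best_cost = 0;

    for (; *tbl; tbl++) {   /* NULL-terminated candidate list */
        clock_t t0, t1;
        int i;

        if ((*tbl)->valid && !(*tbl)->valid())
            continue;       /* CPU lacks the instructions */
        t0 = clock();
        for (i = 0; i < 1000; i++)
            (*tbl)->run();
        t1 = clock();
        if (!best || t1 - t0 < best_cost) {
            best = *tbl;
            best_cost = t1 - t0;
        }
    }
    return best;
}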
+569
lib/raid6/avx512.c
···
+/* -*- linux-c -*- --------------------------------------------------------
+ *
+ *   Copyright (C) 2016 Intel Corporation
+ *
+ *   Author: Gayatri Kammela <gayatri.kammela@intel.com>
+ *   Author: Megha Dey <megha.dey@linux.intel.com>
+ *
+ *   Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
+ *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
+ *   (at your option) any later version; incorporated herein by reference.
+ *
+ * -----------------------------------------------------------------------
+ */
+
+/*
+ * AVX512 implementation of RAID-6 syndrome functions
+ *
+ */
+
+#ifdef CONFIG_AS_AVX512
+
+#include <linux/raid/pq.h>
+#include "x86.h"
+
+static const struct raid6_avx512_constants {
+    u64 x1d[8];
+} raid6_avx512_constants __aligned(512) = {
+    { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
+      0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
+      0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
+      0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
+};
+
+static int raid6_have_avx512(void)
+{
+    return boot_cpu_has(X86_FEATURE_AVX2) &&
+        boot_cpu_has(X86_FEATURE_AVX) &&
+        boot_cpu_has(X86_FEATURE_AVX512F) &&
+        boot_cpu_has(X86_FEATURE_AVX512BW) &&
+        boot_cpu_has(X86_FEATURE_AVX512VL) &&
+        boot_cpu_has(X86_FEATURE_AVX512DQ);
+}
+
+static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+    u8 **dptr = (u8 **)ptrs;
+    u8 *p, *q;
+    int d, z, z0;
+
+    z0 = disks - 3;        /* Highest data disk */
+    p = dptr[z0+1];        /* XOR parity */
+    q = dptr[z0+2];        /* RS syndrome */
+
+    kernel_fpu_begin();
+
+    asm volatile("vmovdqa64 %0,%%zmm0\n\t"
+             "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
+             :
+             : "m" (raid6_avx512_constants.x1d[0]));
+
+    for (d = 0; d < bytes; d += 64) {
+        asm volatile("prefetchnta %0\n\t"
+                 "vmovdqa64 %0,%%zmm2\n\t"     /* P[0] */
+                 "prefetchnta %1\n\t"
+                 "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
+                 "vmovdqa64 %1,%%zmm6"
+                 :
+                 : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
+        for (z = z0-2; z >= 0; z--) {
+            asm volatile("prefetchnta %0\n\t"
+                     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
+                     "vpmovm2b %%k1,%%zmm5\n\t"
+                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+                     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
+                     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
+                     "vmovdqa64 %0,%%zmm6"
+                     :
+                     : "m" (dptr[z][d]));
+        }
+        asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
+                 "vpmovm2b %%k1,%%zmm5\n\t"
+                 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+                 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+                 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+                 "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
+                 "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
+                 "vmovntdq %%zmm2,%0\n\t"
+                 "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
+                 "vmovntdq %%zmm4,%1\n\t"
+                 "vpxorq %%zmm4,%%zmm4,%%zmm4"
+                 :
+                 : "m" (p[d]), "m" (q[d]));
+    }
+
+    asm volatile("sfence" : : : "memory");
+    kernel_fpu_end();
+}
+
+static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
+                       size_t bytes, void **ptrs)
+{
+    u8 **dptr = (u8 **)ptrs;
+    u8 *p, *q;
+    int d, z, z0;
+
+    z0 = stop;        /* P/Q right side optimization */
+    p = dptr[disks-2];    /* XOR parity */
+    q = dptr[disks-1];    /* RS syndrome */
+
+    kernel_fpu_begin();
+
+    asm volatile("vmovdqa64 %0,%%zmm0"
+             : : "m" (raid6_avx512_constants.x1d[0]));
+
+    for (d = 0 ; d < bytes ; d += 64) {
+        asm volatile("vmovdqa64 %0,%%zmm4\n\t"
+                 "vmovdqa64 %1,%%zmm2\n\t"
+                 "vpxorq %%zmm4,%%zmm2,%%zmm2"
+                 :
+                 : "m" (dptr[z0][d]), "m" (p[d]));
+        /* P/Q data pages */
+        for (z = z0-1 ; z >= start ; z--) {
+            asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+                     "vpmovm2b %%k1,%%zmm5\n\t"
+                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+                     "vmovdqa64 %0,%%zmm5\n\t"
+                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+                     "vpxorq %%zmm5,%%zmm4,%%zmm4"
+                     :
+                     : "m" (dptr[z][d]));
+        }
+        /* P/Q left side optimization */
+        for (z = start-1 ; z >= 0 ; z--) {
+            asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+                     "vpmovm2b %%k1,%%zmm5\n\t"
+                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+                     "vpxorq %%zmm5,%%zmm4,%%zmm4"
+                     :
+                     : );
+        }
+        asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
+                 /* Don't use movntdq for r/w memory area < cache line */
+                 "vmovdqa64 %%zmm4,%0\n\t"
+                 "vmovdqa64 %%zmm2,%1"
+                 :
+                 : "m" (q[d]), "m" (p[d]));
+    }
+
+    asm volatile("sfence" : : : "memory");
+    kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_avx512x1 = {
+    raid6_avx5121_gen_syndrome,
+    raid6_avx5121_xor_syndrome,
+    raid6_have_avx512,
+    "avx512x1",
+    1            /* Has cache hints */
+};
+
+/*
+ * Unrolled-by-2 AVX512 implementation
+ */
+static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+    u8 **dptr = (u8 **)ptrs;
+    u8 *p, *q;
+    int d, z, z0;
+
+    z0 = disks - 3;        /* Highest data disk */
+    p = dptr[z0+1];        /* XOR parity */
+    q = dptr[z0+2];        /* RS syndrome */
+
+    kernel_fpu_begin();
+
+    asm volatile("vmovdqa64 %0,%%zmm0\n\t"
+             "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
+             :
+             : "m" (raid6_avx512_constants.x1d[0]));
+
+    /* We uniformly assume a single prefetch covers at least 64 bytes */
+    for (d = 0; d < bytes; d += 128) {
+        asm volatile("prefetchnta %0\n\t"
+                 "prefetchnta %1\n\t"
+                 "vmovdqa64 %0,%%zmm2\n\t"      /* P[0] */
+                 "vmovdqa64 %1,%%zmm3\n\t"      /* P[1] */
+                 "vmovdqa64 %%zmm2,%%zmm4\n\t"  /* Q[0] */
+                 "vmovdqa64 %%zmm3,%%zmm6"      /* Q[1] */
+                 :
+                 : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
+        for (z = z0-1; z >= 0; z--) {
+            asm volatile("prefetchnta %0\n\t"
+                     "prefetchnta %1\n\t"
+                     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
+                     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
+                     "vpmovm2b %%k1,%%zmm5\n\t"
+                     "vpmovm2b %%k2,%%zmm7\n\t"
+                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+                     "vmovdqa64 %0,%%zmm5\n\t"
+                     "vmovdqa64 %1,%%zmm7\n\t"
+                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+                     "vpxorq %%zmm7,%%zmm6,%%zmm6"
+                     :
+                     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
+        }
+        asm volatile("vmovntdq %%zmm2,%0\n\t"
+                 "vmovntdq %%zmm3,%1\n\t"
+                 "vmovntdq %%zmm4,%2\n\t"
+                 "vmovntdq %%zmm6,%3"
+                 :
+                 : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
+                   "m" (q[d+64]));
+    }
+
+    asm volatile("sfence" : : : "memory");
+    kernel_fpu_end();
+}
+
+static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
+                       size_t bytes, void **ptrs)
+{
+    u8 **dptr = (u8 **)ptrs;
+    u8 *p, *q;
+    int d, z, z0;
+
+    z0 = stop;        /* P/Q right side optimization */
+    p = dptr[disks-2];    /* XOR parity */
+    q = dptr[disks-1];    /* RS syndrome */
+
+    kernel_fpu_begin();
+
+    asm volatile("vmovdqa64 %0,%%zmm0"
+             : : "m" (raid6_avx512_constants.x1d[0]));
+
+    for (d = 0 ; d < bytes ; d += 128) {
+        asm volatile("vmovdqa64 %0,%%zmm4\n\t"
+                 "vmovdqa64 %1,%%zmm6\n\t"
+                 "vmovdqa64 %2,%%zmm2\n\t"
+                 "vmovdqa64 %3,%%zmm3\n\t"
+                 "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
+                 "vpxorq %%zmm6,%%zmm3,%%zmm3"
+                 :
+                 : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
+                   "m" (p[d]), "m" (p[d+64]));
+        /* P/Q data pages */
+        for (z = z0-1 ; z >= start ; z--) {
+            asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+                     "vpmovm2b %%k1,%%zmm5\n\t"
+                     "vpmovm2b %%k2,%%zmm7\n\t"
+                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+                     "vmovdqa64 %0,%%zmm5\n\t"
+                     "vmovdqa64 %1,%%zmm7\n\t"
+                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+                     "vpxorq %%zmm7,%%zmm6,%%zmm6"
+                     :
+                     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
+        }
+        /* P/Q left side optimization */
+        for (z = start-1 ; z >= 0 ; z--) {
+            asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+                     "vpmovm2b %%k1,%%zmm5\n\t"
+                     "vpmovm2b %%k2,%%zmm7\n\t"
+                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+                     "vpxorq %%zmm7,%%zmm6,%%zmm6"
+                     :
+                     : );
+        }
+        asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
+                 "vpxorq %1,%%zmm6,%%zmm6\n\t"
+                 /* Don't use movntdq for r/w
+                  * memory area < cache line
+                  */
+                 "vmovdqa64 %%zmm4,%0\n\t"
+                 "vmovdqa64 %%zmm6,%1\n\t"
+                 "vmovdqa64 %%zmm2,%2\n\t"
+                 "vmovdqa64 %%zmm3,%3"
+                 :
+                 : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
+                   "m" (p[d+64]));
+    }
+
+    asm volatile("sfence" : : : "memory");
+    kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_avx512x2 = {
+    raid6_avx5122_gen_syndrome,
+    raid6_avx5122_xor_syndrome,
+    raid6_have_avx512,
+    "avx512x2",
+    1            /* Has cache hints */
+};
+
+#ifdef CONFIG_X86_64
+
+/*
+ * Unrolled-by-4 AVX512 implementation
+ */
+static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+    u8 **dptr = (u8 **)ptrs;
+    u8 *p, *q;
+    int d, z, z0;
+
+    z0 = disks - 3;        /* Highest data disk */
+    p = dptr[z0+1];        /* XOR parity */
+    q = dptr[z0+2];        /* RS syndrome */
+
+    kernel_fpu_begin();
+
+    asm volatile("vmovdqa64 %0,%%zmm0\n\t"
+             "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"    /* Zero temp */
+             "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"    /* P[0] */
+             "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"    /* P[1] */
+             "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"    /* Q[0] */
+             "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"    /* Q[1] */
+             "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" /* P[2] */
+             "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" /* P[3] */
+             "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" /* Q[2] */
+             "vpxorq %%zmm14,%%zmm14,%%zmm14"     /* Q[3] */
+             :
+             : "m" (raid6_avx512_constants.x1d[0]));
+
+    for (d = 0; d < bytes; d += 256) {
+        for (z = z0; z >= 0; z--) {
+            asm volatile("prefetchnta %0\n\t"
+                     "prefetchnta %1\n\t"
+                     "prefetchnta %2\n\t"
+                     "prefetchnta %3\n\t"
+                     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
+                     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
+                     "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
+                     "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
+                     "vpmovm2b %%k1,%%zmm5\n\t"
+                     "vpmovm2b %%k2,%%zmm7\n\t"
+                     "vpmovm2b %%k3,%%zmm13\n\t"
+                     "vpmovm2b %%k4,%%zmm15\n\t"
+                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+                     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
+                     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
+                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+                     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
+                     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
+                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+                     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
+                     "vmovdqa64 %0,%%zmm5\n\t"
+                     "vmovdqa64 %1,%%zmm7\n\t"
+                     "vmovdqa64 %2,%%zmm13\n\t"
+                     "vmovdqa64 %3,%%zmm15\n\t"
+                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+                     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
+                     "vpxorq %%zmm15,%%zmm11,%%zmm11\n"
+                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+                     "vpxorq %%zmm15,%%zmm14,%%zmm14"
+                     :
+                     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
+                       "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
+        }
+        asm volatile("vmovntdq %%zmm2,%0\n\t"
+                 "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
+                 "vmovntdq %%zmm3,%1\n\t"
+                 "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
+                 "vmovntdq %%zmm10,%2\n\t"
+                 "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
+                 "vmovntdq %%zmm11,%3\n\t"
+                 "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
+                 "vmovntdq %%zmm4,%4\n\t"
+                 "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
+                 "vmovntdq %%zmm6,%5\n\t"
+                 "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
+                 "vmovntdq %%zmm12,%6\n\t"
+                 "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
+                 "vmovntdq %%zmm14,%7\n\t"
+                 "vpxorq %%zmm14,%%zmm14,%%zmm14"
+                 :
+                 : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
+                   "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
+                   "m" (q[d+128]), "m" (q[d+192]));
+    }
+
+    asm volatile("sfence" : : : "memory");
+    kernel_fpu_end();
+}
+
+static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
+                       size_t bytes, void **ptrs)
+{
+    u8 **dptr = (u8 **)ptrs;
+    u8 *p, *q;
+    int d, z, z0;
+
+    z0 = stop;        /* P/Q right side optimization */
+    p = dptr[disks-2];    /* XOR parity */
+    q = dptr[disks-1];    /* RS syndrome */
+
+    kernel_fpu_begin();
+
+    asm volatile("vmovdqa64 %0,%%zmm0"
+             :: "m" (raid6_avx512_constants.x1d[0]));
+
+    for (d = 0 ; d < bytes ; d += 256) {
+        asm volatile("vmovdqa64 %0,%%zmm4\n\t"
+                 "vmovdqa64 %1,%%zmm6\n\t"
+                 "vmovdqa64 %2,%%zmm12\n\t"
+                 "vmovdqa64 %3,%%zmm14\n\t"
+                 "vmovdqa64 %4,%%zmm2\n\t"
+                 "vmovdqa64 %5,%%zmm3\n\t"
+                 "vmovdqa64 %6,%%zmm10\n\t"
+                 "vmovdqa64 %7,%%zmm11\n\t"
+                 "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
+                 "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
+                 "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
+                 "vpxorq %%zmm14,%%zmm11,%%zmm11"
+                 :
+                 : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
+                   "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
+                   "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
+                   "m" (p[d+192]));
+        /* P/Q data pages */
+        for (z = z0-1 ; z >= start ; z--) {
+            asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+                     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
+                     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
+                     "prefetchnta %0\n\t"
+                     "prefetchnta %2\n\t"
+                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+                     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
+                     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
+                     "vpmovm2b %%k1,%%zmm5\n\t"
+                     "vpmovm2b %%k2,%%zmm7\n\t"
+                     "vpmovm2b %%k3,%%zmm13\n\t"
+                     "vpmovm2b %%k4,%%zmm15\n\t"
+                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+                     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
+                     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
+                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+                     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
+                     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
+                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+                     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
+                     "vmovdqa64 %0,%%zmm5\n\t"
+                     "vmovdqa64 %1,%%zmm7\n\t"
+                     "vmovdqa64 %2,%%zmm13\n\t"
+                     "vmovdqa64 %3,%%zmm15\n\t"
+                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+                     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
+                     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
+                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+                     "vpxorq %%zmm15,%%zmm14,%%zmm14"
+                     :
+                     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
+                       "m" (dptr[z][d+128]),
+                       "m" (dptr[z][d+192]));
+        }
+        asm volatile("prefetchnta %0\n\t"
+                 "prefetchnta %1\n\t"
+                 :
+                 : "m" (q[d]), "m" (q[d+128]));
+        /* P/Q left side optimization */
+        for (z = start-1 ; z >= 0 ; z--) {
+            asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+                     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
+                     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
+                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+                     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
+                     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
+                     "vpmovm2b %%k1,%%zmm5\n\t"
+                     "vpmovm2b %%k2,%%zmm7\n\t"
+                     "vpmovm2b %%k3,%%zmm13\n\t"
+                     "vpmovm2b %%k4,%%zmm15\n\t"
+                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+                     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
+                     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
+                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+                     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
+                     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
+                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+                     "vpxorq %%zmm15,%%zmm14,%%zmm14"
+                     :
+                     : );
+        }
+        asm volatile("vmovntdq %%zmm2,%0\n\t"
+                 "vmovntdq %%zmm3,%1\n\t"
+                 "vmovntdq %%zmm10,%2\n\t"
+                 "vmovntdq %%zmm11,%3\n\t"
+                 "vpxorq %4,%%zmm4,%%zmm4\n\t"
+                 "vpxorq %5,%%zmm6,%%zmm6\n\t"
+                 "vpxorq %6,%%zmm12,%%zmm12\n\t"
+                 "vpxorq %7,%%zmm14,%%zmm14\n\t"
+                 "vmovntdq %%zmm4,%4\n\t"
+                 "vmovntdq %%zmm6,%5\n\t"
+                 "vmovntdq %%zmm12,%6\n\t"
+                 "vmovntdq %%zmm14,%7"
+                 :
+                 : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
+                   "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
+                   "m" (q[d+128]), "m" (q[d+192]));
+    }
+    asm volatile("sfence" : : : "memory");
+    kernel_fpu_end();
+}
+const struct raid6_calls raid6_avx512x4 = {
+    raid6_avx5124_gen_syndrome,
+    raid6_avx5124_xor_syndrome,
+    raid6_have_avx512,
+    "avx512x4",
+    1            /* Has cache hints */
+};
+#endif
+
+#endif /* CONFIG_AS_AVX512 */
+388
lib/raid6/recov_avx512.c
···
+/*
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Gayatri Kammela <gayatri.kammela@intel.com>
+ * Author: Megha Dey <megha.dey@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ */
+
+#ifdef CONFIG_AS_AVX512
+
+#include <linux/raid/pq.h>
+#include "x86.h"
+
+static int raid6_has_avx512(void)
+{
+    return boot_cpu_has(X86_FEATURE_AVX2) &&
+        boot_cpu_has(X86_FEATURE_AVX) &&
+        boot_cpu_has(X86_FEATURE_AVX512F) &&
+        boot_cpu_has(X86_FEATURE_AVX512BW) &&
+        boot_cpu_has(X86_FEATURE_AVX512VL) &&
+        boot_cpu_has(X86_FEATURE_AVX512DQ);
+}
+
+static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila,
+                     int failb, void **ptrs)
+{
+    u8 *p, *q, *dp, *dq;
+    const u8 *pbmul;    /* P multiplier table for B data */
+    const u8 *qmul;     /* Q multiplier table (for both) */
+    const u8 x0f = 0x0f;
+
+    p = (u8 *)ptrs[disks-2];
+    q = (u8 *)ptrs[disks-1];
+
+    /*
+     * Compute syndrome with zero for the missing data pages
+     * Use the dead data pages as temporary storage for
+     * delta p and delta q
+     */
+
+    dp = (u8 *)ptrs[faila];
+    ptrs[faila] = (void *)raid6_empty_zero_page;
+    ptrs[disks-2] = dp;
+    dq = (u8 *)ptrs[failb];
+    ptrs[failb] = (void *)raid6_empty_zero_page;
+    ptrs[disks-1] = dq;
+
+    raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+    /* Restore pointer table */
+    ptrs[faila] = dp;
+    ptrs[failb] = dq;
+    ptrs[disks-2] = p;
+    ptrs[disks-1] = q;
+
+    /* Now, pick the proper data tables */
+    pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+    qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+        raid6_gfexp[failb]]];
+
+    kernel_fpu_begin();
+
+    /* zmm0 = x0f[16] */
+    asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));
+
+    while (bytes) {
+#ifdef CONFIG_X86_64
+        asm volatile("vmovdqa64 %0, %%zmm1\n\t"
+                 "vmovdqa64 %1, %%zmm9\n\t"
+                 "vmovdqa64 %2, %%zmm0\n\t"
+                 "vmovdqa64 %3, %%zmm8\n\t"
+                 "vpxorq %4, %%zmm1, %%zmm1\n\t"
+                 "vpxorq %5, %%zmm9, %%zmm9\n\t"
+                 "vpxorq %6, %%zmm0, %%zmm0\n\t"
+                 "vpxorq %7, %%zmm8, %%zmm8"
+                 :
+                 : "m" (q[0]), "m" (q[64]), "m" (p[0]),
+                   "m" (p[64]), "m" (dq[0]), "m" (dq[64]),
+                   "m" (dp[0]), "m" (dp[64]));
+
+        /*
+         * 1 = dq[0]  ^ q[0]
+         * 9 = dq[64] ^ q[64]
+         * 0 = dp[0]  ^ p[0]
+         * 8 = dp[64] ^ p[64]
+         */
+
+        asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+                 "vbroadcasti64x2 %1, %%zmm5"
+                 :
+                 : "m" (qmul[0]), "m" (qmul[16]));
+
+        asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
+                 "vpsraw $4, %%zmm9, %%zmm12\n\t"
+                 "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
+                 "vpandq %%zmm7, %%zmm9, %%zmm9\n\t"
+                 "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+                 "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
+                 "vpshufb %%zmm9, %%zmm4, %%zmm14\n\t"
+                 "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
+                 "vpshufb %%zmm12, %%zmm5, %%zmm15\n\t"
+                 "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
+                 "vpxorq %%zmm14, %%zmm15, %%zmm15\n\t"
+                 "vpxorq %%zmm4, %%zmm5, %%zmm5"
+                 :
+                 : );
+
+        /*
+         * 5 = qx[0]
+         * 15 = qx[64]
+         */
+
+        asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+                 "vbroadcasti64x2 %1, %%zmm1\n\t"
+                 "vpsraw $4, %%zmm0, %%zmm2\n\t"
+                 "vpsraw $4, %%zmm8, %%zmm6\n\t"
+                 "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
+                 "vpandq %%zmm7, %%zmm8, %%zmm14\n\t"
+                 "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
+                 "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
+                 "vpshufb %%zmm14, %%zmm4, %%zmm12\n\t"
+                 "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
+                 "vpshufb %%zmm6, %%zmm1, %%zmm13\n\t"
+                 "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
+                 "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t"
+                 "vpxorq %%zmm12, %%zmm13, %%zmm13"
+                 :
+                 : "m" (pbmul[0]), "m" (pbmul[16]));
+
+        /*
+         * 1  = pbmul[px[0]]
+         * 13 = pbmul[px[64]]
+         */
+        asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
+                 "vpxorq %%zmm15, %%zmm13, %%zmm13"
+                 :
+                 : );
+
+        /*
+         * 1 = db = DQ
+         * 13 = db[64] = DQ[64]
+         */
+        asm volatile("vmovdqa64 %%zmm1, %0\n\t"
+                 "vmovdqa64 %%zmm13,%1\n\t"
+                 "vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
+                 "vpxorq %%zmm13, %%zmm8, %%zmm8"
+                 :
+                 : "m" (dq[0]), "m" (dq[64]));
+
+        asm volatile("vmovdqa64 %%zmm0, %0\n\t"
+                 "vmovdqa64 %%zmm8, %1"
+                 :
+                 : "m" (dp[0]), "m" (dp[64]));
+
+        bytes -= 128;
+        p += 128;
+        q += 128;
+        dp += 128;
+        dq += 128;
+#else
+        asm volatile("vmovdqa64 %0, %%zmm1\n\t"
+                 "vmovdqa64 %1, %%zmm0\n\t"
+                 "vpxorq %2, %%zmm1, %%zmm1\n\t"
+                 "vpxorq %3, %%zmm0, %%zmm0"
+                 :
+                 : "m" (*q), "m" (*p), "m"(*dq), "m" (*dp));
+
+        /* 1 = dq ^ q;  0 = dp ^ p */
+
+        asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+                 "vbroadcasti64x2 %1, %%zmm5"
+                 :
+                 : "m" (qmul[0]), "m" (qmul[16]));
+
+        /*
+         * 1 = dq ^ q
+         * 3 = dq ^ p >> 4
+         */
+        asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
+                 "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
+                 "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+                 "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
+                 "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
+                 "vpxorq %%zmm4, %%zmm5, %%zmm5"
+                 :
+                 : );
+
+        /* 5 = qx */
+
+        asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+                 "vbroadcasti64x2 %1, %%zmm1"
+                 :
+                 : "m" (pbmul[0]), "m" (pbmul[16]));
+
+        asm volatile("vpsraw $4, %%zmm0, %%zmm2\n\t"
+                 "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
+                 "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
+                 "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
+                 "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
+                 "vpxorq %%zmm4, %%zmm1, %%zmm1"
+                 :
+                 : );
+
+        /* 1 = pbmul[px] */
+        asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
+                 /* 1 = db = DQ */
+                 "vmovdqa64 %%zmm1, %0\n\t"
+                 :
+                 : "m" (dq[0]));
+
+        asm volatile("vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
+                 "vmovdqa64 %%zmm0, %0"
+                 :
+                 : "m" (dp[0]));
+
+        bytes -= 64;
+        p += 64;
+        q += 64;
+        dp += 64;
+        dq += 64;
+#endif
+    }
+
+    kernel_fpu_end();
+}
+
+static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila,
+                     void **ptrs)
+{
+    u8 *p, *q, *dq;
+    const u8 *qmul;        /* Q multiplier table */
+    const u8 x0f = 0x0f;
+
+    p = (u8 *)ptrs[disks-2];
+    q = (u8 *)ptrs[disks-1];
+
+    /*
+     * Compute syndrome with zero for the missing data page
+     * Use the dead data page as temporary storage for delta q
+     */
+
+    dq = (u8 *)ptrs[faila];
+    ptrs[faila] = (void *)raid6_empty_zero_page;
+    ptrs[disks-1] = dq;
+
+    raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+    /* Restore pointer table */
+    ptrs[faila] = dq;
+    ptrs[disks-1] = q;
+
+    /* Now, pick the proper data tables */
+    qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+    kernel_fpu_begin();
+
+    asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));
+
+    while (bytes) {
+#ifdef CONFIG_X86_64
+        asm volatile("vmovdqa64 %0, %%zmm3\n\t"
+                 "vmovdqa64 %1, %%zmm8\n\t"
+                 "vpxorq %2, %%zmm3, %%zmm3\n\t"
+                 "vpxorq %3, %%zmm8, %%zmm8"
+                 :
+                 : "m" (dq[0]), "m" (dq[64]), "m" (q[0]),
+                   "m" (q[64]));
+
+        /*
+         * 3 = q[0] ^ dq[0]
+         * 8 = q[64] ^ dq[64]
+         */
+        asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
+                 "vmovapd %%zmm0, %%zmm13\n\t"
+                 "vbroadcasti64x2 %1, %%zmm1\n\t"
+                 "vmovapd %%zmm1, %%zmm14"
+                 :
+                 : "m" (qmul[0]), "m" (qmul[16]));
+
+        asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
+                 "vpsraw $4, %%zmm8, %%zmm12\n\t"
+                 "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+                 "vpandq %%zmm7, %%zmm8, %%zmm8\n\t"
+                 "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
+                 "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
+                 "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
+                 "vpshufb %%zmm8, %%zmm13, %%zmm13\n\t"
+                 "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
+                 "vpshufb %%zmm12, %%zmm14, %%zmm14\n\t"
+                 "vpxorq %%zmm0, %%zmm1, %%zmm1\n\t"
+                 "vpxorq %%zmm13, %%zmm14, %%zmm14"
+                 :
+                 : );
+
+        /*
+         * 1  = qmul[q[0]  ^ dq[0]]
+         * 14 = qmul[q[64] ^ dq[64]]
+         */
+        asm volatile("vmovdqa64 %0, %%zmm2\n\t"
+                 "vmovdqa64 %1, %%zmm12\n\t"
+                 "vpxorq %%zmm1, %%zmm2, %%zmm2\n\t"
+                 "vpxorq %%zmm14, %%zmm12, %%zmm12"
+                 :
+                 : "m" (p[0]), "m" (p[64]));
+
+        /*
+         * 2  = p[0]  ^ qmul[q[0]  ^ dq[0]]
+         * 12 = p[64] ^ qmul[q[64] ^ dq[64]]
+         */
+
+        asm volatile("vmovdqa64 %%zmm1, %0\n\t"
+                 "vmovdqa64 %%zmm14, %1\n\t"
+                 "vmovdqa64 %%zmm2, %2\n\t"
+                 "vmovdqa64 %%zmm12,%3"
+                 :
+                 : "m" (dq[0]), "m" (dq[64]), "m" (p[0]),
+                   "m" (p[64]));
+
+        bytes -= 128;
+        p += 128;
+        q += 128;
+        dq += 128;
+#else
+        asm volatile("vmovdqa64 %0, %%zmm3\n\t"
+                 "vpxorq %1, %%zmm3, %%zmm3"
+                 :
+                 : "m" (dq[0]), "m" (q[0]));
+
+        /* 3 = q ^ dq */
+
+        asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
+                 "vbroadcasti64x2 %1, %%zmm1"
+                 :
+                 : "m" (qmul[0]), "m" (qmul[16]));
+
+        asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
+                 "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+                 "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
+                 "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
+                 "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
+                 "vpxorq %%zmm0, %%zmm1, %%zmm1"
+                 :
+                 : );
+
+        /* 1 = qmul[q ^ dq] */
+
+        asm volatile("vmovdqa64 %0, %%zmm2\n\t"
+                 "vpxorq %%zmm1, %%zmm2, %%zmm2"
+                 :
+                 : "m" (p[0]));
+
+        /* 2 = p ^ qmul[q ^ dq] */
+
+        asm volatile("vmovdqa64 %%zmm1, %0\n\t"
+                 "vmovdqa64 %%zmm2, %1"
+                 :
+                 : "m" (dq[0]), "m" (p[0]));
+
+        bytes -= 64;
+        p += 64;
+        q += 64;
+        dq += 64;
+#endif
+    }
+
+    kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_avx512 = {
+    .data2 = raid6_2data_recov_avx512,
+    .datap = raid6_datap_recov_avx512,
+    .valid = raid6_has_avx512,
+#ifdef CONFIG_X86_64
+    .name = "avx512x2",
+#else
+    .name = "avx512x1",
+#endif
+    .priority = 3,
+};
+
+#else
+#warning "your version of binutils lacks AVX512 support"
+#endif
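
The vbroadcasti64x2/vpshufb pairs above implement constant GF(2^8) multiplication by table lookup: each byte is split into nibbles, each nibble indexes a 16-entry table, and the two results are XORed. That is the layout of the kernel's raid6_vgfmul[] tables (16 low-nibble entries followed by 16 high-nibble entries). A byte-at-a-time equivalent with a self-check, all names invented for illustration:

#include <stdint.h>
#include <stdio.h>

/* vpshufb-style multiply: two nibble lookups, XORed */
static uint8_t gf_mul_nibbles(const uint8_t lo[16], const uint8_t hi[16],
                              uint8_t v)
{
    return lo[v & 0x0f] ^ hi[v >> 4];
}

/* mul-by-2 modulo the RAID-6 polynomial, as in gen_syndrome */
static uint8_t gf2_mul2(uint8_t v)
{
    return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

/* slow shift-and-add GF(2^8) multiply, used to build the tables */
static uint8_t gf_mul_slow(uint8_t a, uint8_t b)
{
    uint8_t r = 0;
    while (b) {
        if (b & 1)
            r ^= a;
        a = gf2_mul2(a);
        b >>= 1;
    }
    return r;
}

static void build_tables(uint8_t factor, uint8_t lo[16], uint8_t hi[16])
{
    int i;
    for (i = 0; i < 16; i++) {
        lo[i] = gf_mul_slow(factor, (uint8_t)i);
        hi[i] = gf_mul_slow(factor, (uint8_t)(i << 4));
    }
}

int main(void)
{
    uint8_t lo[16], hi[16];

    build_tables(0x1d, lo, hi);
    /* both lines should print the same product */
    printf("%02x %02x\n", gf_mul_nibbles(lo, hi, 0xa7),
           gf_mul_slow(0x1d, 0xa7));
    return 0;
}

Multiplication distributes over XOR in GF(2^8), so factor*(low nibble) XOR factor*(high nibble) equals factor*byte, which is why the two-lookup trick works.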
+4 -1
lib/raid6/test/Makefile
···
 endif
 
 ifeq ($(IS_X86),yes)
-        OBJS   += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o
+        OBJS   += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o
         CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \
                     gcc -c -x assembler - >&/dev/null && \
                     rm ./-.o && echo -DCONFIG_AS_AVX2=1)
+        CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \
+                    gcc -c -x assembler - >&/dev/null && \
+                    rm ./-.o && echo -DCONFIG_AS_AVX512=1)
 else ifeq ($(HAS_NEON),yes)
         OBJS   += neon.o neon1.o neon2.o neon4.o neon8.o
         CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1
+4 -3
lib/raid6/test/test.c
···
 
 #define NDISKS        16    /* Including P and Q */
 
-const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
+const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
 struct raid6_calls raid6_call;
 
 char *dataptrs[NDISKS];
-char data[NDISKS][PAGE_SIZE];
-char recovi[PAGE_SIZE], recovj[PAGE_SIZE];
+char data[NDISKS][PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
+char recovi[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
+char recovj[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
 
 static void makedata(int start, int stop)
 {
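
The test.c change matters because the AVX512 paths load these arrays with vmovdqa64, which faults unless the address is 64-byte aligned; plain char arrays only get the ABI's default alignment. A tiny check of the attribute's effect:

#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE 4096

/* page alignment implies the 64-byte alignment vmovdqa64 needs */
static char data[2][PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));

int main(void)
{
    assert((uintptr_t)data[0] % 64 == 0);
    assert((uintptr_t)data[1] % 64 == 0);
    return 0;
}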
+10
lib/raid6/x86.h
···
 #define X86_FEATURE_SSSE3    (4*32+ 9) /* Supplemental SSE-3 */
 #define X86_FEATURE_AVX    (4*32+28) /* Advanced Vector Extensions */
 #define X86_FEATURE_AVX2        (9*32+ 5) /* AVX2 instructions */
+#define X86_FEATURE_AVX512F     (9*32+16) /* AVX-512 Foundation */
+#define X86_FEATURE_AVX512DQ    (9*32+17) /* AVX-512 DQ (Double/Quad granular)
+                                           * Instructions
+                                           */
+#define X86_FEATURE_AVX512BW    (9*32+30) /* AVX-512 BW (Byte/Word granular)
+                                           * Instructions
+                                           */
+#define X86_FEATURE_AVX512VL    (9*32+31) /* AVX-512 VL (128/256 Vector Length)
+                                           * Extensions
+                                           */
 #define X86_FEATURE_MMXEXT    (1*32+22) /* AMD MMX extensions */
 
 /* Should work well enough on modern CPUs for testing */