riscv: module: Optimize PLT/GOT entry counting

perf reports that 99.63% of the cycles from `modprobe amdgpu` are spent
inside module_frob_arch_sections(). This is because amdgpu.ko contains
about 300000 relocations in its .rela.text section, and the algorithm in
count_max_entries() takes quadratic time.

Apply two optimizations from the arm64 code, which together reduce the
total execution time by 99.58%. First, sort the relocations so duplicate
entries are adjacent. Second, reduce the number of relocations that must
be sorted by filtering to only relocations that need PLT/GOT entries, as
done in commit d4e0340919 ("arm64/module: Optimize module load time by
optimizing PLT counting").

Unlike the arm64 code, here the filtering and sorting are done in a
scratch buffer, because the HI20 relocation search optimization in
apply_relocate_add() depends on the original order of the relocations.
Using a scratch buffer also allows accumulating PLT/GOT relocations
across sections, so sorting and counting are only done once per module.

Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Link: https://lore.kernel.org/r/20250409171526.862481-3-samuel.holland@sifive.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
This commit is contained in:
Samuel Holland 2025-04-09 10:14:51 -07:00 committed by Palmer Dabbelt
parent 881dadf079
commit be17c0df67
No known key found for this signature in database
GPG Key ID: 2E1319F35FBB1889
1 changed files with 65 additions and 16 deletions

View File

@ -9,6 +9,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleloader.h>
#include <linux/sort.h>
unsigned long module_emit_got_entry(struct module *mod, unsigned long val)
{
@ -55,44 +56,70 @@ unsigned long module_emit_plt_entry(struct module *mod, unsigned long val)
return (unsigned long)&plt[i];
}
/*
 * NOTE(review): this span appears to be a diff rendered without +/- markers.
 * The is_rela_equal() signature and its one-line equality return look like
 * the removed side; cmp_3way() and cmp_rela() look like the added side -
 * confirm against the real patch before treating this text as compilable C.
 *
 * cmp_3way(a, b): evaluates to -1 when a < b, otherwise to (a > b), i.e.
 * 1 or 0. Each argument is expanded more than once.
 *
 * cmp_rela(): comparator over two Elf_Rela entries received as const void *.
 * r_info is compared first; r_addend is only consulted when r_info is equal.
 * Returns the cmp_3way() result: negative, zero or positive.
 */
static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
#define cmp_3way(a, b) ((a) < (b) ? -1 : (a) > (b))
static int cmp_rela(const void *a, const void *b)
{
return x->r_info == y->r_info && x->r_addend == y->r_addend;
const Elf_Rela *x = a, *y = b;
int i;
/* sort by type, symbol index and addend */
i = cmp_3way(x->r_info, y->r_info);
if (i == 0)
/* r_info ties are broken by the addend */
i = cmp_3way(x->r_addend, y->r_addend);
return i;
}
/*
 * duplicate_rela(): report whether rela[idx] duplicates an earlier entry.
 *
 * NOTE(review): this span appears to interleave both sides of a diff. The
 * loop calling is_rela_equal() and the "return false" look like the removed
 * linear scan over rela[0..idx-1]; the final return comparing rela[idx]
 * with rela[idx - 1] via cmp_rela() looks like the replacement - confirm
 * against the real patch.
 *
 * The cmp_rela()-based check only inspects the immediately preceding slot,
 * so it relies on the array having been sorted with cmp_rela beforehand.
 * idx == 0 yields false because there is no preceding entry.
 */
static bool duplicate_rela(const Elf_Rela *rela, int idx)
{
int i;
for (i = 0; i < idx; i++) {
if (is_rela_equal(&rela[i], &rela[idx]))
return true;
}
return false;
/*
* Entries are sorted by type, symbol index and addend. That means
* that, if a duplicate entry exists, it must be in the preceding slot.
*/
return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0;
}
/*
 * count_max_entries(): add the number of distinct PLT and GOT relocations
 * found in relas[0..num-1] to *plts and *gots.
 *
 * R_RISCV_CALL_PLT and R_RISCV_PLT32 increment *plts; R_RISCV_GOT_HI20
 * increments *gots. Entries for which duplicate_rela() returns true are
 * not counted. The counters are only incremented, never reset, so the
 * caller supplies their starting values.
 *
 * NOTE(review): this span appears to interleave both sides of a diff: two
 * signatures, two loop headers, and per-case "if (!duplicate_rela(...))"
 * guards next to an up-front "continue" check. The (Elf_Rela *, int)
 * signature, the int loop and the per-case guards look like the removed
 * side - confirm against the real patch.
 */
static void count_max_entries(Elf_Rela *relas, int num,
static void count_max_entries(const Elf_Rela *relas, size_t num,
unsigned int *plts, unsigned int *gots)
{
for (int i = 0; i < num; i++) {
for (size_t i = 0; i < num; i++) {
/* skip entries already counted */
if (duplicate_rela(relas, i))
continue;
switch (ELF_R_TYPE(relas[i].r_info)) {
case R_RISCV_CALL_PLT:
case R_RISCV_PLT32:
if (!duplicate_rela(relas, i))
(*plts)++;
(*plts)++;
break;
case R_RISCV_GOT_HI20:
if (!duplicate_rela(relas, i))
(*gots)++;
(*gots)++;
break;
default:
/*
 * Any other relocation type is treated as impossible here; the
 * visible caller copies only entries accepted by
 * rela_needs_plt_got_entry() into the array it passes in.
 */
unreachable();
}
}
}
/*
 * Tell whether a relocation is of a type that is counted toward the
 * module's PLT or GOT: R_RISCV_CALL_PLT, R_RISCV_GOT_HI20 or
 * R_RISCV_PLT32. Every other relocation type yields false.
 */
static bool rela_needs_plt_got_entry(const Elf_Rela *rela)
{
	const unsigned int type = ELF_R_TYPE(rela->r_info);

	if (type == R_RISCV_CALL_PLT)
		return true;
	if (type == R_RISCV_GOT_HI20)
		return true;
	if (type == R_RISCV_PLT32)
		return true;

	return false;
}
int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
char *secstrings, struct module *mod)
{
size_t num_scratch_relas = 0;
unsigned int num_plts = 0;
unsigned int num_gots = 0;
Elf_Rela *scratch = NULL;
size_t scratch_size = 0;
int i;
/*
@ -122,9 +149,10 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
/* Calculate the maximum number of entries */
for (i = 0; i < ehdr->e_shnum; i++) {
size_t num_relas = sechdrs[i].sh_size / sizeof(Elf_Rela);
Elf_Rela *relas = (void *)ehdr + sechdrs[i].sh_offset;
int num_rela = sechdrs[i].sh_size / sizeof(Elf_Rela);
Elf_Shdr *dst_sec = sechdrs + sechdrs[i].sh_info;
size_t scratch_size_needed;
if (sechdrs[i].sh_type != SHT_RELA)
continue;
@ -133,7 +161,28 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
if (!(dst_sec->sh_flags & SHF_EXECINSTR))
continue;
count_max_entries(relas, num_rela, &num_plts, &num_gots);
/*
* apply_relocate_add() relies on HI20 and LO12 relocation pairs being
* close together, so sort a copy of the section to avoid interfering.
*/
scratch_size_needed = (num_scratch_relas + num_relas) * sizeof(*scratch);
if (scratch_size_needed > scratch_size) {
scratch_size = scratch_size_needed;
scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL);
if (!scratch)
return -ENOMEM;
}
for (size_t j = 0; j < num_relas; j++)
if (rela_needs_plt_got_entry(&relas[j]))
scratch[num_scratch_relas++] = relas[j];
}
if (scratch) {
/* sort the accumulated PLT/GOT relocations so duplicates are adjacent */
sort(scratch, num_scratch_relas, sizeof(*scratch), cmp_rela, NULL);
count_max_entries(scratch, num_scratch_relas, &num_plts, &num_gots);
kvfree(scratch);
}
mod->arch.plt.shdr->sh_type = SHT_NOBITS;