linux/kernel/trace/simple_ring_buffer.c

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2025 - Google LLC
* Author: Vincent Donnefort <vdonnefort@google.com>
*/
#include <linux/atomic.h>
#include <linux/simple_ring_buffer.h>
#include <asm/barrier.h>
#include <asm/local.h>

enum simple_rb_link_type {
SIMPLE_RB_LINK_NORMAL = 0,
SIMPLE_RB_LINK_HEAD = 1,
SIMPLE_RB_LINK_HEAD_MOVING
};

#define SIMPLE_RB_LINK_MASK ~(SIMPLE_RB_LINK_HEAD | SIMPLE_RB_LINK_HEAD_MOVING)
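
/*
 * The head flags are encoded in the low bits of the preceding page's ->next
 * pointer: struct list_head pointers are at least 4-byte aligned, so bits 0
 * and 1 are always free. For example (illustrative addresses), a link of
 * 0xffff800010 tagged with SIMPLE_RB_LINK_HEAD becomes 0xffff800011, and
 * "link & SIMPLE_RB_LINK_MASK" recovers the original pointer.
 */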
static void simple_bpage_set_head_link(struct simple_buffer_page *bpage)
{
unsigned long link = (unsigned long)bpage->link.next;
link &= SIMPLE_RB_LINK_MASK;
link |= SIMPLE_RB_LINK_HEAD;
/*
* Paired with simple_rb_find_head() to order access between the head
* link and overrun. It ensures we always report an up-to-date value
* after swapping the reader page.
*/
smp_store_release(&bpage->link.next, (struct list_head *)link);
}

static bool simple_bpage_unset_head_link(struct simple_buffer_page *bpage,
struct simple_buffer_page *dst,
enum simple_rb_link_type new_type)
{
unsigned long *link = (unsigned long *)(&bpage->link.next);
unsigned long old = (*link & SIMPLE_RB_LINK_MASK) | SIMPLE_RB_LINK_HEAD;
unsigned long new = (unsigned long)(&dst->link) | new_type;
return try_cmpxchg(link, &old, new);
}

static void simple_bpage_set_normal_link(struct simple_buffer_page *bpage)
{
unsigned long link = (unsigned long)bpage->link.next;
WRITE_ONCE(bpage->link.next, (struct list_head *)(link & SIMPLE_RB_LINK_MASK));
}

static struct simple_buffer_page *simple_bpage_from_link(struct list_head *link)
{
unsigned long ptr = (unsigned long)link & SIMPLE_RB_LINK_MASK;
return container_of((struct list_head *)ptr, struct simple_buffer_page, link);
}

static struct simple_buffer_page *simple_bpage_next_page(struct simple_buffer_page *bpage)
{
return simple_bpage_from_link(bpage->link.next);
}

static void simple_bpage_reset(struct simple_buffer_page *bpage)
{
bpage->write = 0;
bpage->entries = 0;
local_set(&bpage->page->commit, 0);
}

static void simple_bpage_init(struct simple_buffer_page *bpage, void *page)
{
INIT_LIST_HEAD(&bpage->link);
bpage->page = (struct buffer_data_page *)page;
simple_bpage_reset(bpage);
}
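
/*
 * Meta-page counters are read concurrently (see the reader fields updated in
 * simple_ring_buffer_swap_reader_page()), so they are always updated with a
 * single WRITE_ONCE() to avoid torn values. There is a single writer, so the
 * plain read-modify-write below does not race with other increments.
 */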
#define simple_rb_meta_inc(__meta, __inc) \
	WRITE_ONCE((__meta), (__meta) + (__inc))

static bool simple_rb_loaded(struct simple_rb_per_cpu *cpu_buffer)
{
return !!cpu_buffer->bpages;
}

static int simple_rb_find_head(struct simple_rb_per_cpu *cpu_buffer)
{
int retry = cpu_buffer->nr_pages * 2;
struct simple_buffer_page *head;
head = cpu_buffer->head_page;
while (retry--) {
unsigned long link;
spin:
/* See smp_store_release in simple_bpage_set_head_link() */
link = (unsigned long)smp_load_acquire(&head->link.prev->next);
switch (link & ~SIMPLE_RB_LINK_MASK) {
/* Found the head */
case SIMPLE_RB_LINK_HEAD:
cpu_buffer->head_page = head;
return 0;
		/* The writer caught the head; we can spin, it won't be long */
case SIMPLE_RB_LINK_HEAD_MOVING:
goto spin;
}
head = simple_bpage_next_page(head);
}
return -EBUSY;
}

/**
 * simple_ring_buffer_swap_reader_page - Swap ring-buffer head with the reader
 * @cpu_buffer: A simple_rb_per_cpu
 *
 * This function enables consuming reads: once swapped out of the ring, the
 * previous head page can no longer be overwritten by the writer and can be
 * read safely.
 *
 * Returns 0 on success, -ENODEV if @cpu_buffer was unloaded, -EBUSY if the
 * head page could not be caught or -EINVAL if the swap did not complete
 * after several retries.
 */
int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer)
{
struct simple_buffer_page *last, *head, *reader;
unsigned long overrun;
int retry = 8;
int ret;
if (!simple_rb_loaded(cpu_buffer))
return -ENODEV;
reader = cpu_buffer->reader_page;
do {
/* Run after the writer to find the head */
ret = simple_rb_find_head(cpu_buffer);
if (ret)
return ret;
head = cpu_buffer->head_page;
/* Connect the reader page around the header page */
reader->link.next = head->link.next;
reader->link.prev = head->link.prev;
/* The last page before the head */
last = simple_bpage_from_link(head->link.prev);
/* The reader page points to the new header page */
simple_bpage_set_head_link(reader);
overrun = cpu_buffer->meta->overrun;
	} while (!simple_bpage_unset_head_link(last, reader, SIMPLE_RB_LINK_NORMAL) && retry--);

	/* retry is negative only when every attempt failed */
	if (retry < 0)
		return -EINVAL;
cpu_buffer->head_page = simple_bpage_from_link(reader->link.next);
cpu_buffer->head_page->link.prev = &reader->link;
cpu_buffer->reader_page = head;
cpu_buffer->meta->reader.lost_events = overrun - cpu_buffer->last_overrun;
cpu_buffer->meta->reader.id = cpu_buffer->reader_page->id;
cpu_buffer->last_overrun = overrun;
return 0;
}
EXPORT_SYMBOL_GPL(simple_ring_buffer_swap_reader_page);
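
/*
 * Example (illustrative sketch, not part of the original file): a consuming
 * reader. After a successful swap, the old head page belongs to the reader
 * and can be parsed without racing with the writer; parse_page() is a
 * hypothetical callback.
 */
static inline int example_consume_page(struct simple_rb_per_cpu *cpu_buffer,
				       void (*parse_page)(struct buffer_data_page *))
{
	int err = simple_ring_buffer_swap_reader_page(cpu_buffer);

	if (err)
		return err;

	/* The reader page now holds what was the head page */
	parse_page(cpu_buffer->reader_page->page);

	return 0;
}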
static struct simple_buffer_page *simple_rb_move_tail(struct simple_rb_per_cpu *cpu_buffer)
{
struct simple_buffer_page *tail, *new_tail;
tail = cpu_buffer->tail_page;
new_tail = simple_bpage_next_page(tail);
if (simple_bpage_unset_head_link(tail, new_tail, SIMPLE_RB_LINK_HEAD_MOVING)) {
		/*
		 * Oh no! We've caught the head. There is no head anymore, and
		 * the reader will spin in simple_rb_find_head() until we set
		 * a new one. Overrun must be written first, to make sure we
		 * report the correct number of lost events.
		 */
simple_rb_meta_inc(cpu_buffer->meta->overrun, new_tail->entries);
simple_rb_meta_inc(cpu_buffer->meta->pages_lost, 1);
simple_bpage_set_head_link(new_tail);
simple_bpage_set_normal_link(tail);
}
simple_bpage_reset(new_tail);
cpu_buffer->tail_page = new_tail;
simple_rb_meta_inc(cpu_buffer->meta->pages_touched, 1);
return new_tail;
}

static unsigned long rb_event_size(unsigned long length)
{
struct ring_buffer_event *event;
return length + RB_EVNT_HDR_SIZE + sizeof(event->array[0]);
}

static struct ring_buffer_event *
rb_event_add_ts_extend(struct ring_buffer_event *event, u64 delta)
{
event->type_len = RINGBUF_TYPE_TIME_EXTEND;
event->time_delta = delta & TS_MASK;
event->array[0] = delta >> TS_SHIFT;
return (struct ring_buffer_event *)((unsigned long)event + 8);
}
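
/*
 * A time-extend event carries a delta too large for the 27-bit time_delta
 * field: the low TS_SHIFT bits go in time_delta and the remaining high bits
 * in array[0], so the reader reassembles the value as
 * (array[0] << TS_SHIFT) | time_delta. The event itself is 8 bytes (a 4-byte
 * header plus array[0]), hence the "+ 8" above.
 */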
static struct ring_buffer_event *
simple_rb_reserve_next(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, u64 timestamp)
{
unsigned long ts_ext_size = 0, event_size = rb_event_size(length);
struct simple_buffer_page *tail = cpu_buffer->tail_page;
struct ring_buffer_event *event;
u32 write, prev_write;
u64 time_delta;
time_delta = timestamp - cpu_buffer->write_stamp;
if (test_time_stamp(time_delta))
ts_ext_size = 8;
prev_write = tail->write;
write = prev_write + event_size + ts_ext_size;
if (unlikely(write > (PAGE_SIZE - BUF_PAGE_HDR_SIZE)))
tail = simple_rb_move_tail(cpu_buffer);
if (!tail->entries) {
tail->page->time_stamp = timestamp;
time_delta = 0;
ts_ext_size = 0;
write = event_size;
prev_write = 0;
}
tail->write = write;
tail->entries++;
cpu_buffer->write_stamp = timestamp;
event = (struct ring_buffer_event *)(tail->page->data + prev_write);
if (ts_ext_size) {
event = rb_event_add_ts_extend(event, time_delta);
time_delta = 0;
}
event->type_len = 0;
event->time_delta = time_delta;
event->array[0] = event_size - RB_EVNT_HDR_SIZE;
return event;
}

/**
 * simple_ring_buffer_reserve - Reserve an entry in @cpu_buffer
 * @cpu_buffer: A simple_rb_per_cpu
 * @length: Size of the entry in bytes
 * @timestamp: Timestamp of the entry
 *
 * Returns the address where the entry data can be written, or NULL if the
 * buffer is not ready for writing.
 */
void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length,
u64 timestamp)
{
struct ring_buffer_event *rb_event;
if (cmpxchg(&cpu_buffer->status, SIMPLE_RB_READY, SIMPLE_RB_WRITING) != SIMPLE_RB_READY)
return NULL;
rb_event = simple_rb_reserve_next(cpu_buffer, length, timestamp);
return &rb_event->array[1];
}
EXPORT_SYMBOL_GPL(simple_ring_buffer_reserve);

/**
* simple_ring_buffer_commit - Commit the entry reserved with simple_ring_buffer_reserve()
* @cpu_buffer: The simple_rb_per_cpu where the entry has been reserved
*/
void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer)
{
local_set(&cpu_buffer->tail_page->page->commit,
cpu_buffer->tail_page->write);
simple_rb_meta_inc(cpu_buffer->meta->entries, 1);
/*
* Paired with simple_rb_enable_tracing() to ensure data is
* written to the ring-buffer before teardown.
*/
smp_store_release(&cpu_buffer->status, SIMPLE_RB_READY);
}
EXPORT_SYMBOL_GPL(simple_ring_buffer_commit);
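
/*
 * Example (illustrative sketch, not part of the original file): writing a
 * single u64 payload. simple_ring_buffer_reserve() returns the payload
 * address; the entry only becomes visible to the reader once
 * simple_ring_buffer_commit() has run.
 */
static inline void example_write_u64(struct simple_rb_per_cpu *cpu_buffer,
				     u64 val, u64 timestamp)
{
	u64 *entry = simple_ring_buffer_reserve(cpu_buffer, sizeof(val), timestamp);

	if (!entry)
		return;	/* Unloaded, disabled, or a writer is already active */

	*entry = val;
	simple_ring_buffer_commit(cpu_buffer);
}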
static u32 simple_rb_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable)
{
u32 prev_status;
if (enable)
return cmpxchg(&cpu_buffer->status, SIMPLE_RB_UNAVAILABLE, SIMPLE_RB_READY);
/* Wait for the buffer to be released */
do {
prev_status = cmpxchg_acquire(&cpu_buffer->status,
SIMPLE_RB_READY,
SIMPLE_RB_UNAVAILABLE);
} while (prev_status == SIMPLE_RB_WRITING);
return prev_status;
}
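
/*
 * The status word is a small state machine: UNAVAILABLE <-> READY, with
 * READY -> WRITING -> READY around each reserve/commit pair. Disabling spins
 * while a writer is mid-commit; the acquire here pairs with the release in
 * simple_ring_buffer_commit() so the last entry is visible before the buffer
 * is declared unavailable.
 */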
/**
 * simple_ring_buffer_reset - Reset @cpu_buffer
 * @cpu_buffer: A simple_rb_per_cpu
 *
 * This does not clear the data; it only resets counters and pointers.
 *
 * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded.
 */
int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer)
{
struct simple_buffer_page *bpage;
u32 prev_status;
int ret;
if (!simple_rb_loaded(cpu_buffer))
return -ENODEV;
prev_status = simple_rb_enable_tracing(cpu_buffer, false);
ret = simple_rb_find_head(cpu_buffer);
if (ret)
return ret;
bpage = cpu_buffer->tail_page = cpu_buffer->head_page;
do {
simple_bpage_reset(bpage);
bpage = simple_bpage_next_page(bpage);
} while (bpage != cpu_buffer->head_page);
simple_bpage_reset(cpu_buffer->reader_page);
cpu_buffer->last_overrun = 0;
cpu_buffer->write_stamp = 0;
cpu_buffer->meta->reader.read = 0;
cpu_buffer->meta->reader.lost_events = 0;
cpu_buffer->meta->entries = 0;
cpu_buffer->meta->overrun = 0;
cpu_buffer->meta->read = 0;
cpu_buffer->meta->pages_lost = 0;
cpu_buffer->meta->pages_touched = 0;
if (prev_status == SIMPLE_RB_READY)
simple_rb_enable_tracing(cpu_buffer, true);
return 0;
}
EXPORT_SYMBOL_GPL(simple_ring_buffer_reset);

int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer,
struct simple_buffer_page *bpages,
const struct ring_buffer_desc *desc,
void *(*load_page)(unsigned long va),
void (*unload_page)(void *va))
{
struct simple_buffer_page *bpage = bpages;
int ret = 0;
void *page;
int i;
	/* At least one reader page and two pages in the ring-buffer */
if (desc->nr_page_va < 3)
return -EINVAL;
memset(cpu_buffer, 0, sizeof(*cpu_buffer));
cpu_buffer->meta = load_page(desc->meta_va);
if (!cpu_buffer->meta)
return -EINVAL;
	memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta));
	cpu_buffer->meta->meta_page_size = PAGE_SIZE;
/* The reader page is not part of the ring initially */
page = load_page(desc->page_va[0]);
if (!page) {
unload_page(cpu_buffer->meta);
return -EINVAL;
}
simple_bpage_init(bpage, page);
bpage->id = 0;
cpu_buffer->nr_pages = 1;
cpu_buffer->reader_page = bpage;
cpu_buffer->tail_page = bpage + 1;
cpu_buffer->head_page = bpage + 1;
for (i = 1; i < desc->nr_page_va; i++) {
page = load_page(desc->page_va[i]);
if (!page) {
ret = -EINVAL;
break;
}
simple_bpage_init(++bpage, page);
bpage->link.next = &(bpage + 1)->link;
bpage->link.prev = &(bpage - 1)->link;
bpage->id = i;
cpu_buffer->nr_pages = i + 1;
}
	if (ret) {
		/* Unload what load_page() returned, not the raw descriptor VAs */
		for (i--; i >= 0; i--)
			unload_page(bpages[i].page);
		unload_page(cpu_buffer->meta);
		return ret;
	}
/* Close the ring */
bpage->link.next = &cpu_buffer->tail_page->link;
cpu_buffer->tail_page->link.prev = &bpage->link;
	/* The last init'ed page points to the head page */
	simple_bpage_set_head_link(bpage);

	/* nr_pages is only final now that every page has been loaded */
	cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages;

	cpu_buffer->bpages = bpages;
return 0;
}
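
/*
 * Example (illustrative sketch, not part of the original file): in a setup
 * where pages must be mapped before use (e.g. a hypervisor mapping host
 * pages), the callbacks would look like the hypothetical pair below:
 *
 *	static void *my_load_page(unsigned long va)
 *	{
 *		return my_map(va);	// hypothetical: map and return the new address
 *	}
 *
 *	static void my_unload_page(void *va)
 *	{
 *		my_unmap(va);		// hypothetical: undo my_load_page()
 *	}
 *
 *	err = simple_ring_buffer_init_mm(cpu_buffer, bpages, desc,
 *					 my_load_page, my_unload_page);
 */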
static void *__load_page(unsigned long page)
{
return (void *)page;
}

static void __unload_page(void *page) { }

/**
* simple_ring_buffer_init - Init @cpu_buffer based on @desc
* @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller.
* @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va
* @desc: A ring_buffer_desc
*
* Returns 0 on success or -EINVAL if the content of @desc is invalid
*/
int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages,
const struct ring_buffer_desc *desc)
{
return simple_ring_buffer_init_mm(cpu_buffer, bpages, desc, __load_page, __unload_page);
}
EXPORT_SYMBOL_GPL(simple_ring_buffer_init);
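
/*
 * Example (illustrative sketch, not part of the original file): the caller
 * allocates one struct simple_buffer_page per page described by @desc. This
 * assumes linux/slab.h for kcalloc():
 *
 *	struct simple_buffer_page *bpages;
 *	int err;
 *
 *	bpages = kcalloc(desc->nr_page_va, sizeof(*bpages), GFP_KERNEL);
 *	if (!bpages)
 *		return -ENOMEM;
 *
 *	err = simple_ring_buffer_init(cpu_buffer, bpages, desc);
 */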
void simple_ring_buffer_unload_mm(struct simple_rb_per_cpu *cpu_buffer,
void (*unload_page)(void *))
{
int p;
if (!simple_rb_loaded(cpu_buffer))
return;
simple_rb_enable_tracing(cpu_buffer, false);
unload_page(cpu_buffer->meta);
for (p = 0; p < cpu_buffer->nr_pages; p++)
unload_page(cpu_buffer->bpages[p].page);
cpu_buffer->bpages = NULL;
}

/**
* simple_ring_buffer_unload - Prepare @cpu_buffer for deletion
* @cpu_buffer: A simple_rb_per_cpu that will be deleted.
*/
void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer)
{
	simple_ring_buffer_unload_mm(cpu_buffer, __unload_page);
}
EXPORT_SYMBOL_GPL(simple_ring_buffer_unload);

/**
* simple_ring_buffer_enable_tracing - Enable or disable writing to @cpu_buffer
* @cpu_buffer: A simple_rb_per_cpu
* @enable: True to enable tracing, False to disable it
*
* Returns 0 on success or -ENODEV if @cpu_buffer was unloaded
*/
int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable)
{
if (!simple_rb_loaded(cpu_buffer))
return -ENODEV;
simple_rb_enable_tracing(cpu_buffer, enable);
return 0;
}
EXPORT_SYMBOL_GPL(simple_ring_buffer_enable_tracing);
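
/*
 * Typical lifecycle (illustrative summary of the API above):
 *
 *	simple_ring_buffer_init(cpu_buffer, bpages, desc);
 *	simple_ring_buffer_enable_tracing(cpu_buffer, true);
 *
 *	// writer side, for each event:
 *	//   simple_ring_buffer_reserve() + simple_ring_buffer_commit()
 *	// reader side, for each consumed page:
 *	//   simple_ring_buffer_swap_reader_page()
 *
 *	simple_ring_buffer_unload(cpu_buffer);
 */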