SV DPI exported function as co-routine, is it LRM compliant?

I want to implement a Python-style generator in SystemVerilog, using DPI and the C ucontext functions getcontext/makecontext/swapcontext.

In the C code, a separate stack is created to run the SV exported function. The exported function does NOT run to the end; instead it yields, its context is saved, and it is resumed later. This way, multiple user stacks are created, running multiple instances of the SV exported function in an interleaved way, controlled by the master thread in SV, which may contain delay control statements.

This piece of code does work. However, it seems to break the SV function semantics, namely that a function call does NOT consume time and always runs to the end. Could anyone please help confirm?

SV source:

package xxx_gen_pkg;                                                                                                                      

import "DPI-C" context function chandle xxx_co_create (int gen_id);
import "DPI-C" context function void    xxx_co_destory(chandle cor);
import "DPI-C" context function void    xxx_co_resume (chandle cor);
import "DPI-C" context function void    xxx_co_yield  ();           

export "DPI-C" function run_gen;

typedef class xxx_gen_base;
xxx_gen_base gen_table[int];

// DPI export entry point: given a generator id, look the generator up in
// gen_table and run its body().
function void run_gen(int gen_id);
    automatic xxx_gen_base target = gen_table[gen_id];
    target.body();
endfunction

// Minimal command object passed from a generator to its consumer.
class xxx_cmd;
    int cmn_data;  // payload; create() fills it from $urandom

    // Factory: allocate a command and give it a random payload.
    static function xxx_cmd create();
        xxx_cmd fresh;
        fresh = new();
        fresh.cmn_data = $urandom();
        return fresh;
    endfunction : create
endclass : xxx_cmd

// Base class for generator objects. Each instance registers itself in
// gen_table under a unique id and owns one C-side coroutine (gen_cor) on
// which its body() is run via the exported function run_gen().
virtual class xxx_gen_base;
    local static int glbl_gen_id = 0;   // next id to hand out
    local int gen_id;                   // this generator's key in gen_table
    local chandle gen_cor;              // C coroutine handle; null once destroyed
    local xxx_cmd yield_val_buf[$];     // values yielded but not yet consumed by next()

    // This is the generator function; derived classes call yield() from it.
    pure virtual function void body;

    // Allocate an id, register in gen_table and create the C coroutine.
    function new;
        gen_id = glbl_gen_id++;
        gen_table[gen_id] = this;
        gen_cor = xxx_co_create(gen_id);
    endfunction

    // Release the C coroutine and drop the gen_table entry so the object is
    // no longer kept alive by the table. Safe to call more than once.
    function void destroy;
        if (gen_cor != null) begin
            xxx_co_destory(gen_cor);
            gen_cor = null;
        end
        if (gen_table.exists(gen_id))
            gen_table.delete(gen_id);
    endfunction

    // Resume the coroutine until its next yield() and return the yielded
    // command. Returns null (after reporting an error) when the coroutine
    // handle is null, instead of passing a null handle into the C code.
    function xxx_cmd next;
        if (gen_cor == null) begin
            $error("xxx_gen_base::next: generator %0d has no coroutine (destroyed or creation failed)", gen_id);
            return null;
        end
        xxx_co_resume(gen_cor);
        return yield_val_buf.pop_front();
    endfunction

    // Called from body(): queue a value for next() and switch back to the
    // context that resumed this generator.
    function void yield(xxx_cmd cmd);
        yield_val_buf.push_back(cmd);
        xxx_co_yield();
    endfunction

    // Id assigned at construction.
    function int get_gen_id; return gen_id; endfunction
endclass:xxx_gen_base

endpackage

// Demo testbench: two generators are polled from one initial block, their
// handles are swapped after each round, and rounds are separated by #10.
module tb;
    import xxx_gen_pkg::*;

    // Concrete generator: yields 1000 freshly created commands.
    class my_gen extends xxx_gen_base;
        function void body();
            repeat (1000) begin
                yield(xxx_cmd::create());
            end
        endfunction : body
    endclass : my_gen

    initial begin
        automatic my_gen gen0 = new, gen1 = new;
        $display("start to run\n");
        for (int round = 0; round < 10; round++) begin
            my_gen swap_tmp;
            $display("%0d",$time,,,"gen%0d yield %0d\n", gen0.get_gen_id, gen0.next().cmn_data);
            $display("%0d",$time,,,"gen%0d yield %0d\n", gen1.get_gen_id, gen1.next().cmn_data);
            // Exchange the two handles so the polling order alternates.
            swap_tmp = gen0;
            gen0 = gen1;
            gen1 = swap_tmp;
            #10;
        end
        $display("finished");
        gen0.destroy();
        gen1.destroy();
    end

endmodule

C source:

#include <stdlib.h>
#include <assert.h>
#include <ucontext.h>
#include <string.h>
#include "sv_vpi_user.h"                               

#ifdef __cplusplus
extern "C" {      
#endif            

    /* Per-coroutine control block. It is allocated as a single chunk of
     * cor_sz bytes, so the coroutine's private stack storage directly
     * follows the header fields. */
    typedef struct cor_s {
        ucontext_t ctx;   /* saved context used with getcontext/makecontext/swapcontext */
        int gen_id;       /* id forwarded to the exported SV function run_gen() */
        int stack[0];     /* zero-length trailing array: start of the stk_sz-byte stack area */
    } cor_t, *cor_p;      

    /* DPI imports implemented in this file (called from SystemVerilog). */
    cor_p xxx_co_create(int gen_id);  /* allocate a coroutine for generator gen_id */
    void xxx_co_destory(cor_p cor);   /* free a coroutine created by xxx_co_create */
    void xxx_co_resume(cor_p cor);    /* switch from the main context into cor */
    void xxx_co_yield();              /* switch from the active coroutine back to the main context */
    /* DPI export implemented in SystemVerilog (called from cor_entry). */
    void run_gen(int gen_id);       

#ifdef __cplusplus
}                 
#endif            

/* Size in bytes of each coroutine's private stack. The default is kept at
 * 4 KiB; it can be overridden at compile time (e.g. -Dstk_sz=65536) when the
 * code run on the coroutine needs more room. */
#ifndef stk_sz
#define stk_sz (4 * 1024)
#endif
/* Total allocation size for one coroutine: header plus stack area. */
#define cor_sz (sizeof(cor_t) + stk_sz)
/* Only main_cor.ctx is used: it holds the context of the caller of
 * xxx_co_resume() while a coroutine is running. */
static cor_t main_cor;
/* Coroutine most recently passed to xxx_co_resume(). */
static cor_p act_cor;

/*
 * Entry function installed by makecontext() for every coroutine. It runs the
 * exported SV function run_gen() for the active coroutine's generator.
 *
 * A generator running to completion is not implemented (NYI): uc_link is
 * NULL, so simply returning from here would terminate the thread. The checks
 * are therefore done with explicit code instead of assert(), so they remain
 * in effect when the file is compiled with NDEBUG.
 */
static void cor_entry(void)
{
    if (act_cor == NULL) {
        vpi_printf("cor_entry: no active coroutine\n");
        abort();
    }

    run_gen(act_cor->gen_id);

    /* NYI: the generator body returned instead of yielding. */
    vpi_printf("cor_entry: generator gen%0d ran to completion (not supported)\n",
               act_cor->gen_id);
    abort();
}

cor_p xxx_co_create(int gen_id)
{                              
    cor_p cor = (cor_p)malloc(cor_sz);
    assert(cor != NULL);
    assert(getcontext(&cor->ctx) == 0);

        cor->ctx.uc_link = NULL;
        cor->ctx.uc_stack.ss_sp = cor->stack;
        cor->ctx.uc_stack.ss_size = stk_sz;
        cor->ctx.uc_stack.ss_flags = 0;
    cor->gen_id = gen_id;

    cor->stack[0] = 0x0badc0de;
    cor->stack[1] = 0xdeadbeaf;
    cor->stack[2] = 0xcafebabe;
    cor->stack[2] = 0x55555555;
    cor->stack[3] = 0xaaaaaaaa;
    cor->stack[4] = 0x11110000;
    cor->stack[5] = 0x00001111;

    makecontext(&cor->ctx, cor_entry, 0);
    return cor;
}

/*
 * Free a coroutine created by xxx_co_create(). Passing NULL is harmless.
 * If the coroutine is the one recorded as active, the record is cleared so
 * that act_cor never points at freed memory.
 */
void xxx_co_destory(cor_p cor)
{
    if (act_cor == cor)
        act_cor = NULL;
    free(cor);
}

/*
 * Switch from the caller's context into coroutine cor. The caller's context
 * is saved in main_cor.ctx; control comes back here when the coroutine calls
 * xxx_co_yield().
 *
 * The context switch is performed outside assert() so it is not compiled
 * out under NDEBUG. A NULL handle or a failing swapcontext() is fatal.
 */
void xxx_co_resume(cor_p cor)
{
    if (cor == NULL) {
        vpi_printf("xxx_co_resume: NULL coroutine handle\n");
        abort();
    }

    vpi_printf("resume to coroutine gen%0d\n", cor->gen_id);
    act_cor = cor;  /* cor_entry() and xxx_co_yield() find the coroutine through act_cor */

    if (swapcontext(&main_cor.ctx, &cor->ctx) != 0) {
        vpi_printf("xxx_co_resume: swapcontext failed for gen%0d\n", cor->gen_id);
        abort();
    }

    vpi_printf("resume from coroutine gen%0d\n", cor->gen_id);
}

/* Global register variable bound to the rsp register (GNU C extension).
 * NOTE(review): not referenced by any code visible here; presumably a
 * debugging aid for inspecting the stack pointer - confirm before removing. */
register char* stk_ptr asm("rsp");

/*
 * Suspend the active coroutine and switch back to the context saved by
 * xxx_co_resume(). Execution continues after the swapcontext() call the next
 * time the coroutine is resumed.
 *
 * Before switching, the guard words that xxx_co_create() wrote at the start
 * of the stack area are compared (not assigned) against their expected
 * values, so that overwritten guard words are actually detected. All checks
 * and the context switch are outside assert() and survive NDEBUG.
 */
void xxx_co_yield()
{
    /* Values left in stack[0..5] by xxx_co_create(). */
    static const unsigned int guard_words[] = {
        0x0badc0deu, 0xdeadbeafu, 0x55555555u,
        0xaaaaaaaau, 0x11110000u, 0x00001111u
    };
    int i;

    if (act_cor == NULL) {
        vpi_printf("xxx_co_yield: no active coroutine\n");
        abort();
    }

    for (i = 0; i < (int)(sizeof(guard_words) / sizeof(guard_words[0])); i++) {
        if ((unsigned int)act_cor->stack[i] != guard_words[i]) {
            vpi_printf("xxx_co_yield: stack guard word %d of gen%0d is corrupted\n",
                       i, act_cor->gen_id);
            abort();
        }
    }

    vpi_printf("yield coroutine gen%0d\n", act_cor->gen_id);

    if (swapcontext(&act_cor->ctx, &main_cor.ctx) != 0) {
        vpi_printf("xxx_co_yield: swapcontext failed for gen%0d\n", act_cor->gen_id);
        abort();
    }

    vpi_printf("resumed coroutine gen%0d\n", act_cor->gen_id);
}

In reply to robert.liu:

I don’t have the time to review your code in detail, but it seems you are confusing the concept of CPU time to execute C software in the form of a function with simulation time, which is a hardware modeling abstraction. See my DVCon paper “Easy Steps Towards Virtual Prototyping using the SystemVerilog DPI” for some background.

In reply to dave_59:

The idea of the code is to make the exported SV function work like a Python generator.

This is the python generator:

# a generator that yields items instead of returning a list
def firstn(n):
    """Yield the integers 0, 1, ..., n-1 one at a time."""
    num = 0
    while num < n:
        yield num
        num += 1

In my example:

// Generator body: yields 1000 newly created commands, one per yield() call.
function void body();
        repeat(1000) yield(xxx_cmd::create());
    endfunction: body

    // Queue the yielded command, then call the imported C function that
    // switches context.
    function void yield(xxx_cmd cmd);
        yield_val_buf.push_back(cmd);
        xxx_co_yield();
    endfunction

import "DPI-C" context function void    xxx_co_yield  ();           

void xxx_co_yield()
{
...
    assert(swapcontext(&act_cor->ctx, &main_cor.ctx) == 0);
...
}

Here swapcontext explicitly switches between the stack used by the SV exported function and the stack used by the C imported function.

Note that the single-threaded semantics of the Verilog simulation aren’t broken in the example. However, the difference from a normal DPI application is that the example uses multiple stacks and the standard C functions getcontext/makecontext/swapcontext to implement user-level cooperative multitasking. It’s like sc_cor in SystemC, which supports sc_thread.

In reply to robert.liu:

And the SV exported function is NOT called by the SV main thread on the same stack through the call chain SV → DPI C imported function → SV exported function. There is a stack switch again.

initial begin // **this is the main thread, in real TB it would be uvm_sequence::body or uvm_component::run_phase**
...
    $display("%0d",$time,,,"gen%0d yield %0d\n", gen0.get_gen_id, **gen0.next()**.cmn_data);
...
end

    function xxx_cmd next;
        **xxx_co_resume(gen_cor);**
        return yield_val_buf.pop_front;
    endfunction 

import "DPI-C" context function void    xxx_co_resume (chandle cor);

void xxx_co_resume(cor_p cor)
{
    vpi_printf("resume to coroutine gen%0d\n", cor->gen_id);
    act_cor = cor;
    assert(**swapcontext**(&main_cor.ctx, &cor->ctx) == 0);
    vpi_printf("resume from coroutine gen%0d\n", cor->gen_id);
}

We can have one or more generators in the test. (2 in the example code).
Then we can call any of those generators at any time if we need a transaction from it.

It is similar to running multiple uvm_sequences on a sequencer with the uvm_driver calling get_next_item. The difference is that the generator approach is more controllable in two respects:

  1. we can precisely control which generator generates the next transaction item.
  2. more importantly, there is a single SV thread (only the main thread) in the example code. The SV debugger can “jump” from the main thread (from xxx_co_resume) into the middle of the SV exported function precisely (just after the xxx_co_yield function call statement). With thousands of generators in the testbench, our debugging effort is minimized.

In reply to robert.liu:

Hope my explanation above is clear.

I would just like to know whether this kind of DPI usage is permitted by the LRM. After all, I haven’t seen any similar example in which a separate user-level stack is allocated and the SV exported function runs on it, rather than on the stack managed by the simulator kernel (that is, the stack used by the main thread and the DPI C imported function).

In reply to dave_59:

Hi Dave, I got some bandwidth to study the DPI part of the LRM carefully. I found two interesting paragraphs:

35.5.3 Context tasks and functions

— Detecting when control moves across the language boundary between SystemVerilog and an
imported language is critical for simulators managing DPI context. Therefore, if user code
circumvents unwinding an export call chain back to its import chain caller (e.g., by using C setjmp/
longjmp constructs), the results are undefined.

H.9.2 Context of imported and exported tasks and functions

When control passes across the boundary between SystemVerilog and a DPI import call chain with the
context property, the value of the import’s context is potentially either set or reset (see 35.5.3). Therefore,
user code behavior is undefined for DPI import C code that circumvents SystemVerilog exports unwinding
across the boundary to their import caller (e.g., by using C setjmp and longjmp constructs).

According to the setjmp reference: “A less common use of setjmp is to create syntax similar to coroutines.”

In my example, I used swapcontext, which is similar to setjmp/longjmp. However, execution jumps from the imported C function to the exported function and then always jumps back from the exported function to the imported function. The imported function then returns to the SV caller.

Does this code break the LRM rules?

In reply to robert.liu:

It certainly violates the spirit of the LRM, since swapcontext is considered an advanced form of setjmp.

I think we may be stepping into the XY problem. Without knowing what got you here in the first place, it’s difficult to suggest a better approach. For example, you might be able to use C++ iterators without any need for stack manipulation.

In reply to dave_59:

Ok. Here is the background.

For our design, there is a need to spawn thousands of uvm_sequences working on top of several interfaces. Those sequences don’t work independently. We need a “scheduler” or “dispatcher” between those sequences and the interfaces. Specifically, we have several expectations:

  1. the scheduler can decide which sequence should generate/send next transaction item. This can be done by defining a new API including semaphore and `uvm_do in uvm_sequence.
  2. we want to debug the complicated logic inside the sequence easily. That is, we want the execution to be able to switch from the scheduler to the sequence being selected like a function call. For example, we have a sequence:
task body()
...
code block 0 to generate item0;
send_item_to_sched(item0);
code block 1 to generate item1;
send_item_to_sched(item1);
...
endtask
scheduler:
...
seqX.get_next_item();
...
seqY.get_next_item();
...

Since the sequence logic would be very complicated, we can’t easily know where to set up the breakpoint. So we need a simple way for execution to “jump” from the scheduler directly to code block 0 or 1, like a function call.

The Python kind of generator (a kind of coroutine) meets our requirement. The coroutine saves and restores the calling context and continues execution from the “yield” point. And we just “call” those generators like functions.

So I discarded the plan of using (thousands of) uvm_sequences and instead implemented a coroutine approach using DPI. And it really works on Questasim.

In reply to dave_59:

This article, “python generator”, discusses iterators and generators. I really hope SV can have native support for the Python generator construct. C++20 also begins to support coroutines; see “C++20 coroutine”.

Although an SV process is not the same as an OS thread, the property they have in common is that the user can’t explicitly switch execution from process 0 to process 1 in the same way as a function call. This sometimes makes debugging very difficult.