1 //
2 // Copyright (c) 2021 Travis Geiselbrecht
3 //
4 // Use of this source code is governed by a MIT-style
5 // license that can be found in the LICENSE file or at
6 // https://opensource.org/licenses/MIT
7 
8 #include <arch/atomic.h>
9 #include <lk/init.h>
10 #include <lk/err.h>
11 #include <lk/cpp.h>
12 #include <lk/trace.h>
13 #include <lk/list.h>
14 #include <dev/bus/pci.h>
15 #include <kernel/event.h>
16 #include <kernel/thread.h>
17 #include <kernel/vm.h>
18 #include <lib/minip.h>
19 #include <lib/pktbuf.h>
20 #include <string.h>
21 #include <platform/interrupts.h>
22 #include <type_traits>
23 
24 #include "e1000_hw.h"
25 
26 #define LOCAL_TRACE 0
27 
class e1000;
static e1000 *the_e; // XXX hack to remember the first e1000 seen and use for minip

// list of known 8086:x e1000 devices to match against
struct e1000_id_features {
    uint16_t id;  // PCI device id (the vendor id is always 0x8086)
    bool e1000e;  // device uses the newer e1000e (8257x+) register layout
};

const e1000_id_features e1000_ids[] = {
    { 0x100c, false }, // 82544GC QEMU 'e1000-82544gc'
    { 0x100e, false }, // 82540EM QEMU 'e1000'
    { 0x100f, false }, // 82545EM QEMU 'e1000-82544em'
    { 0x10d3, true }, // 82574L  QEMU 'e1000e'
    { 0x1533, true }, // i210
};
44 
45 // i210 ids
46 // 0x1533
47 // 0x1536
48 // 0x1537
49 // 0x1538
50 
51 // i219 ids
52 // 0x156f
53 // 0x1570
54 //    soc integrated versions?
55 // 0x1a1c // i219-LM (17)
56 // 0x1a1d // i219-V  (17)
57 // 0x1a1e // i219-LM (16)
58 // 0x1a1f // i219-V  (16)
59 
60 
// Driver state for a single e1000/e1000e PCI NIC instance.
class e1000 {
public:
    e1000();
    ~e1000();

    // Bring the device up: map BAR0, read the MAC from the EEPROM, allocate
    // the rx/tx rings, hook the interrupt, and start the rx worker thread.
    status_t init_device(pci_location_t loc, const e1000_id_features *id);

    // Queue one packet for transmission; returns a status_t-style int.
    int tx(pktbuf_t *p);

    // true if the device uses the newer e1000e (8257x+) register layout
    bool is_e1000e() const { return id_feat_->e1000e; }

    // 6-byte MAC address read out of the EEPROM by init_device()
    const uint8_t *mac_addr() const { return mac_addr_; }

private:
    // ring geometry; rxbuffer_len matches the BSIZE 2048 programmed in RCTL
    static const size_t rxring_len = 64;
    static const size_t txring_len = 64;
    static const size_t rxbuffer_len = 2048;

    // MMIO accessors into the BAR0 register window
    uint32_t read_reg(e1000_reg reg);
    void write_reg(e1000_reg reg, uint32_t val);
    // read one 16-bit word from the NVM/EEPROM via the EERD register
    uint16_t read_eeprom(uint8_t offset);

    handler_return irq_handler();

    // hand an empty pktbuf to the hardware rx ring (locked variant assumes
    // the caller already holds lock_)
    void add_pktbuf_to_rxring(pktbuf_t *pkt);
    void add_pktbuf_to_rxring_locked(pktbuf_t *pkt);

    // counter of configured devices, used to assign unit numbers
    static volatile int global_count_;
    int unit_ = 0;

    // main spinlock protecting ring state and rx_queue_
    spin_lock_t lock_ = SPIN_LOCK_INITIAL_VALUE;

    // configuration
    pci_location_t loc_ = {};
    void *bar0_regs_ = nullptr; // virtual mapping of the BAR0 MMIO window
    uint8_t mac_addr_[6] = {};
    const e1000_id_features *id_feat_ = nullptr;

    // rx ring
    rdesc *rxring_ = nullptr;
    uint32_t rx_last_head_ = 0; // last hardware head position we processed
    uint32_t rx_tail_ = 0;      // next free slot to hand to the hardware
    pktbuf_t *rx_pktbuf_[rxring_len] = {}; // pktbuf parked at each ring slot
    uint8_t *rx_buf_ = nullptr; // rxbuffer_len * rxring_len byte buffer that rx_pktbuf[] points to

    // rx worker thread
    list_node rx_queue_ = LIST_INITIAL_VALUE(rx_queue_); // completed packets awaiting the worker
    event_t rx_event_ = EVENT_INITIAL_VALUE(rx_event_, 0, EVENT_FLAG_AUTOUNSIGNAL);
    thread_t *rx_worker_thread_ = nullptr;
    int rx_worker_routine();

    // tx ring
    tdesc *txring_ = nullptr;
    uint32_t tx_last_head_ = 0;
    uint32_t tx_tail_ = 0;
    pktbuf_t *tx_pktbuf_[txring_len] = {}; // pktbufs owned until the hw retires them
};
120 
read_reg(e1000_reg reg)121 uint32_t e1000::read_reg(e1000_reg reg) {
122     volatile uint32_t *r = (volatile uint32_t *)((uintptr_t)bar0_regs_ + (size_t)reg);
123 
124     return *r;
125 }
126 
write_reg(e1000_reg reg,uint32_t val)127 void e1000::write_reg(e1000_reg reg, uint32_t val) {
128     volatile uint32_t *r = (volatile uint32_t *)((uintptr_t)bar0_regs_ + (size_t)reg);
129 
130     *r = val;
131 }
132 
read_eeprom(uint8_t offset)133 uint16_t e1000::read_eeprom(uint8_t offset) {
134     // 8257x+ seems to have a different EERD layout
135     uint32_t val;
136     if (is_e1000e()) {
137         write_reg(e1000_reg::EERD, (offset << 2) | 0x1); // data + start bit
138 
139         // spin while bit 1 (DONE) is clear
140         while (((val = read_reg(e1000_reg::EERD)) & (1<<1)) == 0)
141             ;
142     } else {
143         write_reg(e1000_reg::EERD, (offset << 8) | 0x1); // data + start bit
144 
145         // spin while bit 4 (DONE) is clear
146         while (((val = read_reg(e1000_reg::EERD)) & (1<<4)) == 0)
147             ;
148     }
149     return val >> 16;
150 }
151 
// running count of devices configured so far, used to assign unit numbers
volatile int e1000::global_count_ = 0;

e1000::e1000() = default;
e1000::~e1000() {
    // TODO: free resources
}
158 
// Interrupt handler for this device. Reads the interrupt cause register
// (which auto-clears on read) and services each asserted cause bit.
// Runs in interrupt context via the wrapper registered in init_device().
handler_return e1000::irq_handler() {
    // read the interrupt cause register, which also auto clears all bits
    auto icr = read_reg(e1000_reg::ICR);
    if (!icr) {
        // no cause bits set: spurious or shared interrupt, not ours
        return INT_NO_RESCHEDULE;
    }

    LTRACEF("icr %#x\n", icr);

    // already in interrupt context, so no need to re-save irq state
    AutoSpinLockNoIrqSave guard(&lock_);

    handler_return ret = INT_NO_RESCHEDULE;

    if (icr & (1<<0)) { // TXDW - transmit descriptor written back
        // NOTE(review): this irq is unmasked at the end of init_device()
        // but handling (reclaiming completed tx pktbufs) is unimplemented
        PANIC_UNIMPLEMENTED;
    }
    if (icr & (1<<1)) { // TXQE - transmit queue empty
        //PANIC_UNIMPLEMENTED;
        // nothing to really do here
    }
    if (icr & (1<<6)) { // RXO - receiver overrun (hw ran out of rx buffers)
        printf("e1000: RX OVERRUN\n");
    }
    if (icr & (1<<7)) { // RXTO - rx timer interrupt
        // rx timer fired, packets are probably ready
        auto rdh = read_reg(e1000_reg::RDH);
        auto rdt = read_reg(e1000_reg::RDT);

        // walk the ring from where we last stopped up to the hardware head
        while (rx_last_head_ != rdh) {
            // copy the current rx descriptor locally for better cache performance
            rdesc rxd;
            copy(&rxd, rxring_ + rx_last_head_);

            LTRACEF("last_head %#x RDH %#x RDT %#x\n", rx_last_head_, rdh, rdt);
            if (LOCAL_TRACE) rxd.dump();

            // recover the pktbuf we queued in this spot
            DEBUG_ASSERT(rx_pktbuf_[rx_last_head_]);
            DEBUG_ASSERT(pktbuf_data_phys(rx_pktbuf_[rx_last_head_]) == rxd.addr);
            pktbuf_t *pkt = rx_pktbuf_[rx_last_head_];

            bool consumed_pkt = false;
            if (rxd.status & (1 << 0)) { // descriptor done, we own it now
                if (rxd.status & (1<<1)) { // end of packet
                    if (rxd.errors == 0) {
                        // good packet, trim data len according to the rx descriptor
                        pkt->dlen = rxd.length;
                        pkt->flags |= PKTBUF_FLAG_EOF; // just to make sure

                        // queue it in the rx queue
                        list_add_tail(&rx_queue_, &pkt->list);

                        // wake up the rx worker
                        event_signal(&rx_event_, false);
                        ret = INT_RESCHEDULE;
                        consumed_pkt = true;
                    }
                }
            }
            if (!consumed_pkt) {
                // descriptor not done / errored / mid multi-descriptor packet:
                // recycle the same pktbuf straight back onto the ring
                add_pktbuf_to_rxring_locked(pkt);
            }

            rx_last_head_ = (rx_last_head_ + 1) % rxring_len;
        }
    }
    return ret;
}
228 
rx_worker_routine()229 int e1000::rx_worker_routine() {
230     for (;;) {
231         event_wait(&rx_event_);
232 
233         // pull some packets from the received queue
234         for (;;) {
235             pktbuf_t *p;
236 
237             {
238                 AutoSpinLock guard(&lock_);
239 
240                 p = list_remove_head_type(&rx_queue_, pktbuf_t, list);
241             }
242 
243             if (!p) {
244                 break; // nothing left in the queue, go back to waiting
245             }
246 
247             if (LOCAL_TRACE) {
248                 LTRACEF("got packet: ");
249                 pktbuf_dump(p);
250             }
251 
252             // push it up the stack
253             minip_rx_driver_callback(p);
254 
255             // we own the pktbuf again
256 
257             // set the data pointer to the start of the buffer and set dlen to 0
258             pktbuf_reset(p, 0);
259 
260             // add it back to the rx ring at the current tail
261             add_pktbuf_to_rxring(p);
262         }
263     }
264 
265     return 0;
266 }
267 
tx(pktbuf_t * p)268 int e1000::tx(pktbuf_t *p) {
269     LTRACE;
270     if (LOCAL_TRACE) {
271         pktbuf_dump(p);
272     }
273 
274     // build a tx descriptor and stuff it in the tx ring
275     tdesc td = {};
276     td.addr = pktbuf_data_phys(p);
277     td.length = p->dlen;
278     td.cmd = (1<<0); // end of packet (EOP)
279     copy(&txring_[tx_tail_], &td);
280 
281     // save a copy of the pktbuf in our list
282     tx_pktbuf_[tx_tail_] = p;
283 
284     // bump tail forward
285     tx_tail_ = (tx_tail_ + 1) % txring_len;
286     write_reg(e1000_reg::TDT, tx_tail_);
287 
288     LTRACEF("TDH %#x TDT %#x\n", read_reg(e1000_reg::TDH), read_reg(e1000_reg::TDT));
289 
290     return NO_ERROR;
291 }
292 
add_pktbuf_to_rxring_locked(pktbuf_t * p)293 void e1000::add_pktbuf_to_rxring_locked(pktbuf_t *p) {
294     DEBUG_ASSERT(p);
295     DEBUG_ASSERT(p->dlen == 0);
296     DEBUG_ASSERT(p->blen == rxbuffer_len);
297 
298     // add it to the next rxring entry at the tail
299     rdesc rd = {};
300     rd.addr = pktbuf_data_phys(p);
301     copy(&rxring_[rx_tail_], &rd);
302 
303     // save a copy of the pktbuf in our list
304     rx_pktbuf_[rx_tail_] = p;
305 
306     // bump tail forward
307     rx_tail_ = (rx_tail_ + 1) % rxring_len;
308     write_reg(e1000_reg::RDT, rx_tail_);
309 
310     LTRACEF("after RDH %#x RDT %#x\n", read_reg(e1000_reg::RDH), read_reg(e1000_reg::RDT));
311 }
312 
add_pktbuf_to_rxring(pktbuf_t * pkt)313 void e1000::add_pktbuf_to_rxring(pktbuf_t *pkt) {
314     AutoSpinLock guard(&lock_);
315 
316     add_pktbuf_to_rxring_locked(pkt);
317 }
318 
init_device(pci_location_t loc,const e1000_id_features * id)319 status_t e1000::init_device(pci_location_t loc, const e1000_id_features *id) {
320     loc_ = loc;
321     id_feat_ = id;
322     char str[32];
323 
324     LTRACEF("pci location %s\n", pci_loc_string(loc_, str));
325 
326     pci_bar_t bars[6];
327     status_t err = pci_bus_mgr_read_bars(loc_, bars);
328     if (err != NO_ERROR) return err;
329 
330     LTRACEF("e1000 BARS:\n");
331     if (LOCAL_TRACE) pci_dump_bars(bars, 6);
332 
333     if (!bars[0].valid || bars[0].addr == 0) {
334         return ERR_NOT_FOUND;
335     }
336 
337     // allocate a unit number
338     unit_ = atomic_add(&global_count_, 1);
339 
340     // map bar 0, main memory mapped register interface, 128KB
341     snprintf(str, sizeof(str), "e1000 %d bar0", unit_);
342     err = vmm_alloc_physical(vmm_get_kernel_aspace(), str, 128*1024, &bar0_regs_, 0,
343                              bars[0].addr, /* vmm_flags */ 0, ARCH_MMU_FLAG_UNCACHED_DEVICE);
344     if (err != NO_ERROR) {
345         return ERR_NOT_FOUND;
346     }
347 
348     LTRACEF("bar 0 regs mapped to %p\n", bar0_regs_);
349 
350     pci_bus_mgr_enable_device(loc_);
351 
352     // read the mac address out of the eeprom
353     uint16_t tmp;
354     tmp = read_eeprom(0);
355     mac_addr_[0] = tmp & 0xff;
356     mac_addr_[1] = tmp >> 8;
357     tmp = read_eeprom(1);
358     mac_addr_[2] = tmp & 0xff;
359     mac_addr_[3] = tmp >> 8;
360     tmp = read_eeprom(2);
361     mac_addr_[4] = tmp & 0xff;
362     mac_addr_[5] = tmp >> 8;
363 
364     printf("e1000 %d: mac address %02x:%02x:%02x:%02x:%02x:%02x\n", unit_, mac_addr_[0], mac_addr_[1], mac_addr_[2],
365            mac_addr_[3], mac_addr_[4], mac_addr_[5]);
366 
367     // allocate and map space for the rx and tx ring
368     snprintf(str, sizeof(str), "e1000 %d rxring", unit_);
369     err = vmm_alloc_contiguous(vmm_get_kernel_aspace(), str, rxring_len * sizeof(rdesc), (void **)&rxring_, 0, 0, ARCH_MMU_FLAG_UNCACHED);
370     if (err != NO_ERROR) {
371         return ERR_NOT_FOUND;
372     }
373     memset(rxring_, 0, rxring_len * sizeof(rdesc));
374 
375     paddr_t rxring_phys = vaddr_to_paddr(rxring_);
376     LTRACEF("rx ring at %p, physical %#lx\n", rxring_, rxring_phys);
377 
378     snprintf(str, sizeof(str), "e1000 %d txring", unit_);
379     err = vmm_alloc_contiguous(vmm_get_kernel_aspace(), str, txring_len * sizeof(tdesc), (void **)&txring_, 0, 0, ARCH_MMU_FLAG_UNCACHED);
380     if (err != NO_ERROR) {
381         return ERR_NOT_FOUND;
382     }
383     memset(txring_, 0, txring_len * sizeof(rdesc));
384 
385     paddr_t txring_phys = vaddr_to_paddr(txring_);
386     LTRACEF("tx ring at %p, physical %#lx\n", txring_, txring_phys);
387 
388     // allocate a large array of contiguous buffers to receive into
389     snprintf(str, sizeof(str), "e1000 %d rx buffers", unit_);
390     err = vmm_alloc_contiguous(vmm_get_kernel_aspace(), str, rxring_len * rxbuffer_len, (void **)&rx_buf_, 0, 0, 0);
391     if (err != NO_ERROR) {
392         return ERR_NOT_FOUND;
393     }
394 
395     // mask all IRQs
396     write_reg(e1000_reg::IMC, 0xffff);
397 
398     // qemus 82574 emulation seems to want IAME to be set to auto-clear ICR bits.
399     if (is_e1000e()) {
400         auto ctrl_ext = read_reg(e1000_reg::CTL_EXT);
401         write_reg(e1000_reg::CTL_EXT, ctrl_ext | (1<<27)); // IAME - interrupt ack auto-mask
402         write_reg(e1000_reg::IAM, 0); // set such that no IMS bits are auto cleared
403     }
404 
405     // set the interrupt treshold reg
406     const uint32_t irq_rate = 10000; // max 10k irqs/sec
407     write_reg(e1000_reg::ITR, 1000000 / irq_rate * 4);
408     if (is_e1000e()) {
409         write_reg(e1000_reg::EITR0, 1000000 / irq_rate * 4);
410         write_reg(e1000_reg::EITR1, 1000000 / irq_rate * 4);
411         write_reg(e1000_reg::EITR2, 1000000 / irq_rate * 4);
412         write_reg(e1000_reg::EITR3, 1000000 / irq_rate * 4);
413         write_reg(e1000_reg::EITR4, 1000000 / irq_rate * 4);
414     }
415 
416     // disable tx and rx
417     write_reg(e1000_reg::RCTL, 0);
418     write_reg(e1000_reg::TCTL, 0);
419 
420     // irq handler lambda to get to inner method
421     auto irq_handler_wrapper = [](void *arg) -> handler_return {
422         e1000 *e = (e1000 *)arg;
423         return e->irq_handler();
424     };
425 
426     // allocate a MSI interrupt
427     uint irq_base;
428     err = pci_bus_mgr_allocate_msi(loc_, 1, &irq_base);
429     if (err != NO_ERROR) {
430         // fall back to regular IRQs
431         err = pci_bus_mgr_allocate_irq(loc_, &irq_base);
432         if (err != NO_ERROR) {
433             printf("e1000: unable to allocate IRQ\n");
434             return err;
435         }
436         register_int_handler(irq_base, irq_handler_wrapper, this);
437     } else {
438         register_int_handler_msi(irq_base, irq_handler_wrapper, this, true);
439     }
440     LTRACEF("IRQ number %#x\n", irq_base);
441 
442     unmask_interrupt(irq_base);
443 
444     // set up the rx ring
445     write_reg(e1000_reg::RDBAL, rxring_phys & 0xffffffff);
446 #if __INTPTR_WIDTH__ == 64
447     write_reg(e1000_reg::RDBAH, rxring_phys >> 32);
448 #else
449     write_reg(e1000_reg::RDBAH, 0);
450 #endif
451     write_reg(e1000_reg::RDLEN, rxring_len * sizeof(rdesc));
452     // set head and tail to 0
453     write_reg(e1000_reg::RDH, 0);
454     write_reg(e1000_reg::RDT, 0);
455 
456     // disable receive delay timer and absolute delay timer
457     write_reg(e1000_reg::RDTR, 0);
458     write_reg(e1000_reg::RADV, 0);
459     // disable small packet detect
460     write_reg(e1000_reg::RSRPD, 0);
461 
462     // set up the flow control thresholds
463     write_reg(e1000_reg::FCRTL, 0);
464     write_reg(e1000_reg::FCRTH, 0);
465 
466     // fill the rx ring with pktbufs
467     rx_last_head_ = read_reg(e1000_reg::RDH);
468     rx_tail_ = read_reg(e1000_reg::RDT);
469     for (size_t i = 0; i < rxring_len - 1; i++) {
470         // construct a 2K pktbuf, pointing outo our rx_buf_ block of memory
471         auto *pkt = pktbuf_alloc_empty();
472         if (!pkt) {
473             break;
474         }
475         pktbuf_add_buffer(pkt, rx_buf_ + i * rxbuffer_len, rxbuffer_len, 0, 0, nullptr, nullptr);
476 
477         add_pktbuf_to_rxring_locked(pkt);
478     }
479     //hexdump(rxring_, rxring_len * sizeof(rdesc));
480 
481     // start rx worker thread
482     auto wrapper_lambda = [](void *arg) -> int {
483         e1000 *e = (e1000 *)arg;
484         return e->rx_worker_routine();
485     };
486     snprintf(str, sizeof(str), "e1000 %d rx worker", unit_);
487     rx_worker_thread_ = thread_create(str, wrapper_lambda, this, HIGH_PRIORITY, DEFAULT_STACK_SIZE);
488     thread_resume(rx_worker_thread_);
489 
490     // start receiver
491     // enable RX, unicast permiscuous, multicast permiscuous, broadcast accept, BSIZE 2048
492     write_reg(e1000_reg::RCTL, (1<<1) | (1<<3) | (1<<4) | (1<<15) | (0<<16));
493 
494     // unmask receive irq
495     auto ims = read_reg(e1000_reg::IMS);
496     write_reg(e1000_reg::IMS, ims | (1<<7) | (1<<6)); // RXO, RXTO
497 
498     // set up the tx path
499     write_reg(e1000_reg::TDH, 0);
500     write_reg(e1000_reg::TDT, 0);
501     tx_last_head_ = 0;
502     tx_tail_ = 0;
503 
504     // set up the tx ring
505     write_reg(e1000_reg::TDBAL, txring_phys & 0xffffffff);
506 #if __INTPTR_WIDTH__ == 64
507     write_reg(e1000_reg::TDBAH, txring_phys >> 32);
508 #else
509     write_reg(e1000_reg::TDBAH, 0);
510 #endif
511     write_reg(e1000_reg::TDLEN, txring_len * sizeof(tdesc));
512 
513     // enable the transmitter and appropriate irqs
514     write_reg(e1000_reg::TCTL, (1<<3) | (1<<1)); // short packet pad, tx enable
515 
516     // unmask tx irq
517     ims = read_reg(e1000_reg::IMS);
518     write_reg(e1000_reg::IMS, ims | (1<<1) | (1<<0)); // transmit queue empty, tx descriptor write back
519 
520     return NO_ERROR;
521 }
522 
523 extern "C"
e1000_register_with_minip()524 status_t e1000_register_with_minip() {
525     auto tx_routine = [](void *arg, pktbuf_t *p) {
526         auto *e = static_cast<e1000 *>(arg);
527         return e->tx(p);
528     };
529 
530     if (the_e) {
531         minip_set_eth(tx_routine, the_e, the_e->mac_addr());
532         return NO_ERROR;
533     }
534 
535     return ERR_NOT_FOUND;
536 }
537 
e1000_init(uint level)538 static void e1000_init(uint level) {
539     LTRACE_ENTRY;
540 
541     auto ac = lk::make_auto_call([]() { LTRACE_EXIT; });
542 
543     // probe pci to find a device
544     for (auto id:  e1000_ids) {
545         for (size_t i = 0; ; i++) {
546             pci_location_t loc;
547             status_t err = pci_bus_mgr_find_device(&loc, id.id, 0x8086, i);
548             if (err != NO_ERROR) {
549                 break;
550             }
551 
552             // we maybe found one, create a new device and initialize it
553             auto e = new e1000;
554             err = e->init_device(loc, &id);
555             if (err != NO_ERROR) {
556                 char str[14];
557                 printf("e1000: device at %s failed to initialize\n", pci_loc_string(loc, str));
558                 delete e;
559                 continue;
560             }
561 
562             // XXX first e1000 found is remembered
563             the_e = e;
564         }
565     }
566 }
567 
568 LK_INIT_HOOK(e1000, &e1000_init, LK_INIT_LEVEL_PLATFORM + 1);
569