@@ -169,6 +169,7 @@ struct efhw_nic_af_xdp
169169 struct efhw_af_xdp_vi * vi ;
170170 struct efhw_buddy_allocator vi_allocator ;
171171 spinlock_t alloc_lock ;
172+ struct xdp_mmap_offsets mmap_offsets ;
172173};
173174
174175/*----------------------------------------------------------------------------
@@ -583,41 +584,39 @@ static int xdp_create_ring(struct socket* sock,
583584 return 0 ;
584585}
585586
586- static int xdp_create_rings (struct socket * sock ,
587- struct efhw_page_map * page_map , void * kern_mem_base ,
588- long rxq_capacity , long txq_capacity ,
589- struct efab_af_xdp_offsets_rings * kern_offsets ,
590- struct efab_af_xdp_offsets_rings * user_offsets ,
591- struct ring_map * ring_mapping )
587+ static int xdp_get_mmap_offsets (struct xdp_mmap_offsets * mmap_offsets )
592588{
593- int rc ;
589+ struct socket * sock ;
594590 struct sys_call_area rw_area ;
595- struct xdp_mmap_offsets * mmap_offsets ;
591+ struct xdp_mmap_offsets * mmap_offsets_user ;
596592 int * optlen ;
593+ int rc ;
597594
598- EFHW_BUILD_ASSERT (EFAB_AF_XDP_DESC_BYTES == sizeof (struct xdp_desc ));
595+ rc = sock_create (AF_XDP , SOCK_RAW , 0 , & sock );
596+ if ( rc < 0 )
597+ return rc ;
599598
600599 /* We need a read-write area to call getsockopt(). We unmap it from UL
601600 * as soon as possible. */
602601 rc = sys_call_area_alloc (& rw_area );
603602 if ( rc < 0 )
604- return rc ;
603+ goto out_release ;
605604
606- mmap_offsets = sys_call_area_ptr (& rw_area );
607- optlen = (void * )(mmap_offsets + 1 );
608- * optlen = sizeof (* mmap_offsets );
605+ mmap_offsets_user = sys_call_area_ptr (& rw_area );
606+ optlen = (void * )(mmap_offsets_user + 1 );
607+ * optlen = sizeof (* mmap_offsets_user );
609608
610609 /* For linux<=5.7 you can use kernel_getsockopt(),
611610 * but newer versions do not have this function, so we need all these
612611 * sys_call_area_*() calls. */
613612 rc = sock -> ops -> getsockopt (sock , SOL_XDP , XDP_MMAP_OFFSETS ,
614613 (void * )sys_call_area_user_addr (& rw_area ,
615- mmap_offsets ),
614+ mmap_offsets_user ),
616615 (void * )sys_call_area_user_addr (& rw_area , optlen ));
617616
618- /* Security consideration: mmap_offsets is located in untrusted user
617+ /* Security consideration: mmap_offsets_user is located in untrusted user
619618 * memory. I.e. the process can overwrite all this data.
620- * However this is the process which can create an AF_XDP Onload stack ,
619+ * However this is the process which can load an XDP program ,
621620 * so it runs with the root account, and it already can do
622621 * anything bad: reboot, execute arbitrary code, etc.
623622 *
@@ -626,44 +625,60 @@ static int xdp_create_rings(struct socket* sock,
626625 sys_call_area_unmap (& rw_area );
627626 if ( rc < 0 ) {
628627 EFHW_ERR ("%s: getsockopt(XDP_MMAP_OFFSETS) rc=%d" , __func__ , rc );
629- goto out ;
628+ goto out_unpin ;
630629 }
631- EFHW_ASSERT (* optlen == sizeof (* mmap_offsets ));
630+ EFHW_ASSERT (* optlen == sizeof (* mmap_offsets_user ));
631+ 632+ memcpy (mmap_offsets , mmap_offsets_user , sizeof (* mmap_offsets_user ));
633+ rc = 0 ;
634+ 635+ out_unpin :
636+ sys_call_area_unpin (& rw_area );
637+ out_release :
638+ sock_release (sock );
639+ return rc ;
640+ }
641+ 642+ static int xdp_create_rings (struct socket * sock , struct efhw_nic_af_xdp * xdp ,
643+ struct efhw_page_map * page_map , void * kern_mem_base ,
644+ long rxq_capacity , long txq_capacity ,
645+ struct efab_af_xdp_offsets_rings * kern_offsets ,
646+ struct efab_af_xdp_offsets_rings * user_offsets ,
647+ struct ring_map * ring_mapping )
648+ {
649+ int rc ;
650+ 651+ EFHW_BUILD_ASSERT (EFAB_AF_XDP_DESC_BYTES == sizeof (struct xdp_desc ));
632652
633653 rc = xdp_create_ring (sock , page_map , kern_mem_base ,
634654 rxq_capacity , sizeof (struct xdp_desc ),
635655 XDP_RX_RING , XDP_PGOFF_RX_RING ,
636- & mmap_offsets -> rx , & kern_offsets -> rx , & user_offsets -> rx ,
656+ & xdp -> mmap_offsets . rx , & kern_offsets -> rx , & user_offsets -> rx ,
637657 ring_mapping ++ );
638658 if ( rc < 0 )
639- goto out ;
659+ return rc ;
640660
641661 rc = xdp_create_ring (sock , page_map , kern_mem_base ,
642662 txq_capacity , sizeof (struct xdp_desc ),
643663 XDP_TX_RING , XDP_PGOFF_TX_RING ,
644- & mmap_offsets -> tx , & kern_offsets -> tx , & user_offsets -> tx ,
664+ & xdp -> mmap_offsets . tx , & kern_offsets -> tx , & user_offsets -> tx ,
645665 ring_mapping ++ );
646666 if ( rc < 0 )
647- goto out ;
667+ return rc ;
648668
649669 rc = xdp_create_ring (sock , page_map , kern_mem_base ,
650670 rxq_capacity , sizeof (uint64_t ),
651671 XDP_UMEM_FILL_RING , XDP_UMEM_PGOFF_FILL_RING ,
652- & mmap_offsets -> fr , & kern_offsets -> fr , & user_offsets -> fr ,
672+ & xdp -> mmap_offsets . fr , & kern_offsets -> fr , & user_offsets -> fr ,
653673 ring_mapping ++ );
654674 if ( rc < 0 )
655- goto out ;
675+ return rc ;
656676
657677 rc = xdp_create_ring (sock , page_map , kern_mem_base ,
658678 txq_capacity , sizeof (uint64_t ),
659679 XDP_UMEM_COMPLETION_RING , XDP_UMEM_PGOFF_COMPLETION_RING ,
660- & mmap_offsets -> cr , & kern_offsets -> cr , & user_offsets -> cr ,
680+ & xdp -> mmap_offsets . cr , & kern_offsets -> cr , & user_offsets -> cr ,
661681 ring_mapping );
662- if ( rc < 0 )
663- goto out ;
664- 665- out :
666- sys_call_area_unpin (& rw_area );
667682 return rc ;
668683}
669684
@@ -730,6 +745,8 @@ static int af_xdp_init(struct efhw_nic* nic, int instance,
730745 struct socket * sock ;
731746 struct file * file ;
732747 struct efab_af_xdp_offsets * user_offsets ;
748+ const struct cred * old_cred ;
749+ struct cred * cred ;
733750
734751 if ( chunk_size == 0 ||
735752 chunk_size < headroom ||
@@ -749,18 +766,24 @@ static int af_xdp_init(struct efhw_nic* nic, int instance,
749766 if ( sw_bt == NULL )
750767 return - EINVAL ;
751768
769+ cred = prepare_kernel_cred (& init_task );
770+ if ( cred == NULL )
771+ return - ENOMEM ;
772+ old_cred = override_creds (cred );
773+ 752774 /* We need to use network namespace of network device so that
753775 * ifindex passed in bpf syscalls makes sense
754776 * TODO AF_XDP: there is a race here with device changing netns
755- * TODO AF_XDP: this fails unless the user namespace has CAP_NET_RAW
756777 */
757778 rc = __sock_create (dev_net (nic -> net_dev ), AF_XDP , SOCK_RAW , 0 , & sock , 0 );
758779 if ( rc < 0 )
759- return rc ;
780+ goto fail_cred ;
760781
761782 file = sock_alloc_file (sock , 0 , NULL );
762- if ( IS_ERR (file ) )
763- return PTR_ERR (file );
783+ if ( IS_ERR (file ) ) {
784+ rc = PTR_ERR (file );
785+ goto fail_cred ;
786+ }
764787 vi -> sock = sock ;
765788
766789 rc = efhw_page_alloc_zeroed (& vi -> user_offsets_page );
@@ -776,7 +799,7 @@ static int af_xdp_init(struct efhw_nic* nic, int instance,
776799 if ( rc < 0 )
777800 goto fail ;
778801
779- rc = xdp_create_rings (sock , page_map , & vi -> kernel_offsets ,
802+ rc = xdp_create_rings (sock , nic -> arch_extra , page_map , & vi -> kernel_offsets ,
780803 vi -> rxq_capacity , vi -> txq_capacity ,
781804 & vi -> kernel_offsets .rings , & user_offsets -> rings ,
782805 vi -> ring_mapping );
@@ -815,11 +838,17 @@ static int af_xdp_init(struct efhw_nic* nic, int instance,
815838 add_wait_queue (sk_sleep (vi -> sock -> sk ), & vi -> waiter .wait );
816839
817840 user_offsets -> mmap_bytes = efhw_page_map_bytes (page_map );
841+ 842+ revert_creds (old_cred );
843+ put_cred (cred );
818844 return 0 ;
819845
820846 fail :
821847 vi -> waiter .wait .func = NULL ;
822848 xdp_release_vi (nic , vi );
849+ fail_cred :
850+ revert_creds (old_cred );
851+ put_cred (cred );
823852 return rc ;
824853}
825854
@@ -927,6 +956,10 @@ __af_xdp_nic_init_hardware(struct efhw_nic *nic,
927956
928957 spin_lock_init (& xdp -> alloc_lock );
929958
959+ rc = xdp_get_mmap_offsets (& xdp -> mmap_offsets );
960+ if ( rc < 0 )
961+ goto fail_map ;
962+ 930963 rc = af_xdp_vi_allocator_ctor (xdp , nic -> vi_min , nic -> vi_lim );
931964 if ( rc < 0 )
932965 goto fail_map ;
0 commit comments