data-scs-contenttype="Blog-Post" data-asset-operation="view:CORE8B88E20204C04A0DADCEBC0499683C49"> 493 <style> 494 .title { 495 background-color: #fff; 496 border: 1px solid #F1EFED; 497 border-radius: 22px; 498 max-width: 940px; 499 margin: 0 auto; 500 padding: 5px 25px; 501 } 502 </style> 503 <!-- RC81v1 --> 504 505 <section class="rc81 rc81v1 cpad"> 506 507 <div class="rc81w1 bwidth"> 508 509 <div class="rc81"> 510 <ul> 511 <li class="post-categories"><a href="../category/lnx-technologies" class="rc81accent"> Technologies<span>, </span> </a></li> 512 <li class="post-categories"><a href="../category/lnx-linux-kernel-development" class="rc81accent"> Linux Kernel Development<span>, </span> </a></li> 513 </ul> 514 515 </div> 516 <p class="rc81accent" id="categories"></p> 517 <h1>Syscall latency... and some uses of speculative execution</h1> 518 <span id="publishdate">September 12, 2023 |</span><span id="publishdate"> 23 minute read</span> 519 520 <div class="rc81sub "> 521 <img src="../_cache_8b25/_themesdelivery/Blogs-New-Theme/assets/img/ui_defaultuserimage.jpg" alt=""> 522 523 <span><a id="postAuthorName" href="/authors/ankur-arora">Ankur Arora</a> 524 <div><span class="rc81title rw-neutral-200txt"></span> 525 526 </div> 527 </span></div> 528 529 530 <!-- 531 <div class="rc81photo"> 532 <img src=""> 533 </div> 534 --> 535 536 </div> 537 </section> 538 <!-- /RC81v1 --><!-- RC82v0 --> 539 <!-- /RC82v0 --><!-- RC86v0 --> 540 <section class="rc86 rc86v0 cpad"> 541 <div class="rc86w1 bwidth"> 542 <div class="rc86social"> 543 <a href="https://www.facebook.com/dialog/share?app_id=209650819625026&href=../post/syscall-latency" class="sharelink icn-img icn-facebook" aria-label="Share post on Facebook" data-sharetype="facebook"> 544 <!-- <span>Facebook</span> --> 545 </a> 546 <a href="https://twitter.com/share?url=../post/syscall-latency" class="sharelink icn-img icn-twitter" aria-label="Share post on Twitter" data-sharetype="twitter"> 547 <!-- <span>Twitter</span> --> 548 </a> 549 <a href="https://www.linkedin.com/shareArticle?url=../post/syscall-latency" aria-label="Share post on Linkedin" class="sharelink icn-img icn-linkedin" data-sharetype="linked-in"> 550 <!-- <span>LinkedIn</span> --> 551 </a> 552 <a href="placeholder.html" class="sharelink icn-img icn-email" aria-label="Share post on Email" data-sharetype="email"> 553 <!-- <span>Email</span> --> 554 </a> 555 </div> 556 </div> 557 </section> 558 <!-- /RC86v0 --> 559 560 <!-- RC84v0 --> 561 <section class="rc84v0 rc84zoom "> 562 <div class="rc84w1 bwidth"> 563 <div class="rc84zoomui"> 564 <b>Text Size <span id="rc84fs">100%</span>:</b> 565 <div> 566 <a href="#smaller-text" class="rc84-smaller" aria-label="decrease font size to 90%">-</a> 567 <a href="#larger-text" class="rc84-larger" aria-label="increase font size to 110%">+</a> 568 </div> 569 </div> 570 571 572 <div class="rc84post"> 573 574 <!-- RC84v1 --> 575 <section class="rc84 rc84v1"> 576 577 <h2 id="introduction">Introduction</h2> 578 579 <p>Moving from UEK5 to UEK6 brought about an unwelcome surprise: an increase in syscall latency on some x86 systems. The root cause, as we will see, was slightly slower evaluation of audit rules, which, given that they are evaluated for every syscall, is not great.</p> 580 581 <p>In this post we start off by exploring the root cause which turns out to not be UEK specific, it also impacts upstream kernels as well. 
<p>The changes, even though they target low-level optimizations, are quite straightforward, almost trivial.</p>

<h3 id="background">Background</h3>

<p>Execution latency of <code>getpid()</code>[1] increased by about 15% (measured on an Intel Skylake-X system), from 191ns on UEK5 to 217ns on UEK6.</p>

<p>This was measured in the usual way:</p>

<pre class="brush: bash;">clock_gettime(CLOCK_MONOTONIC, &start);
for (i = 0; i < large_number; i++)
        syscall(SYS_getpid);
clock_gettime(CLOCK_MONOTONIC, &stop);</pre>
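<p>For reference, a self-contained version of this microbenchmark might look like the sketch below; the iteration count and the ns/call conversion are illustrative assumptions, not the exact harness behind the numbers in this post:</p>

<pre class="brush: bash;">/* getpid-latency sketch: build with "gcc -O2 -o getpid getpid.c" */
#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	const long iters = 10 * 1000 * 1000;	/* assumed iteration count */
	struct timespec start, stop;
	double ns;
	long i;

	clock_gettime(CLOCK_MONOTONIC, &start);
	for (i = 0; i < iters; i++)
		syscall(SYS_getpid);
	clock_gettime(CLOCK_MONOTONIC, &stop);

	ns = (stop.tv_sec - start.tv_sec) * 1e9 + (stop.tv_nsec - start.tv_nsec);
	printf("%.1f ns/call\n", ns / iters);
	return 0;
}</pre>

<p>Calling through <code>syscall(SYS_getpid)</code> rather than the <code>getpid()</code> wrapper makes sure every iteration actually enters the kernel (older glibc versions cached the pid in userspace).</p>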
<p>A quick <code>perf record</code> showed that almost all of the increased latency was in <code>audit_filter_syscall()</code>, which was more expensive in UEK6.</p>

<p>Oracle Exadata, where this problem was seen, has 37 audit rules that are evaluated in the syscall path. Since audit only wants to log unusual or exceptional events, the benchmark would evaluate these rules in every iteration but never generate any output. Essentially, purely local computation that became slower without there having been any material changes to the audit code or to the audit rules.</p>

<h3 id="cpu-parameters">CPU parameters</h3>

<p>Some Intel Skylake-X parameters that we'll make use of later:</p>

<pre class="brush: bash;">L1-load-latency:  4-6 cycles
L2-load-latency:  14 cycles
L1-cache-size:    32K (512 cachelines: 64 sets, 8 ways each)

ROB size:         224 micro-ops</pre>

<p>The parameters are taken from the <a href="https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html">Intel SDM</a>.</p>

<p><strong>Note:</strong> L1/L2 are the respective data-cache levels, and ROB is the Reorder Buffer, where instructions are staged for in-order retirement.</p>

<h2 id="root-cause-analysis">Root cause analysis</h2>

<p>Drilling down with <code>perf stat -d</code>:</p>

<p>UEK5 (191 ns):</p>

<pre class="brush: bash;"># perf stat -d -r 5 ./getpid
# output normalized for a single getpid() call

 677.9 cycles                 # 3.542 GHz
1635.0 instructions           # 2.40 insn per cycle
 325.0 branches
   0.5 branch-misses          # 0.16% of all branches
 404.0 L1-dcache-loads
   0.4 L1-dcache-load-misses  # 0.10% of all L1-dcache accesses</pre>

<p>UEK6 (217 ns):</p>

<pre class="brush: bash;"># perf stat -d -r 5 ./getpid
# output normalized for a single getpid() call

 770.4 cycles                 # 3.545 GHz
1652.0 instructions           # 2.14 insn per cycle
 332.2 branches
   1.5 branch-misses          # 0.45% of all branches
 407.3 L1-dcache-loads
   8.6 L1-dcache-load-misses  # 2.13% of all L1-dcache accesses</pre>

<p>Comparing the two, this is an increase of ~100 cycles, with the L1d-load and instruction counts being almost identical across UEK5 and UEK6. This underscores the fact that the audit code, which forms the bulk of the instructions executed, hasn't changed all that much.</p>

<p>The IPC is commensurately lower[2]. The proximal cause seems to be the increased L1d-load-misses and the one extra branch-miss.</p>

<p>These observations were confirmed over enough non-correlated runs (with an intervening reboot for each) to be statistically significant. The L1d-load-miss numbers are somewhat variable across boot cycles, but the trend is close to what we see above.</p>

<h3 id="audit_filter_syscall"><code>audit_filter_syscall()</code></h3>

<p>From <code>perf record</code> we know that the bulk of the increased runtime went to <code>audit_filter_syscall()</code>. The procedure itself is primarily a loop that walks the list of rules, calling <code>audit_in_mask()</code> for each rule to check if it needs to be evaluated for the current syscall. For <code>getpid()</code> the answer will be <code>false</code> most of the time (32 of 37 times.)</p>

<pre class="brush: bash;">audit_filter_syscall(...) {
	struct audit_entry *e;
	struct audit_context *ctx;

	list = audit_filter_list[AUDIT_FILTER_EXIT];

	list_for_each_entry_rcu(e, list, list) {

		if (audit_in_mask(&e->rule, ctx->major) &&
		    audit_filter_rules(tsk, &e->rule, ctx, NULL,
				       &state, false)) {
			rcu_read_unlock();
			ctx->current_state = state;
			return state;
		}
	}
}

audit_in_mask(const struct audit_krule *rule, unsigned long val) {
	if (val > 0xffffffff)
		return false;

	/*
	 * val contains the current syscall number. AUDIT_WORD does
	 * some bit shifting on it.
	 */
	word = AUDIT_WORD(val);
	if (word >= AUDIT_BITMASK_SIZE)
		return false;

	bit = AUDIT_BIT(val);

	/*
	 * The load in rule->mask[word] depends on the audit_krule (which
	 * hangs off the current rule entry) and the syscall number.
	 */
	return rule->mask[word] & bit;
}

audit_filter_rules(...) {
	/*
	 * Large switch statement which we ignore for the rest of this
	 * analysis because, as we will see later, loads executed in it don't
	 * have an "interesting" alignment and so their latency should be easy
	 * enough to hide.
	 */
}</pre>
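<p><code>audit_in_mask()</code> is essentially a bitmap test: each <code>audit_krule</code> carries a per-syscall bitmask, and <code>AUDIT_WORD()</code>/<code>AUDIT_BIT()</code> pick the 32-bit word and the bit within it. Paraphrased (a sketch, not a verbatim copy) from <code>include/uapi/linux/audit.h</code>:</p>

<pre class="brush: bash;">#define AUDIT_BITMASK_SIZE  64
#define AUDIT_WORD(nr)      ((__u32)((nr) / 32))
#define AUDIT_BIT(nr)       (1 << ((nr) - AUDIT_WORD(nr) * 32))</pre>

<p>For <code>getpid()</code> (syscall number 39 on x86-64) the check therefore reduces to <code>rule->mask[1] & (1 << 7)</code>: one load from the rule's cacheline plus a handful of ALU operations.</p>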
<h3 id="memory-accesses">Memory accesses</h3>

<p>Next let's look at the data structures accessed in the <code>audit_filter_syscall()</code> loop and where the L1d-load-misses might be coming from.</p>

<pre class="brush: bash;">/* Data structure layout annotated with size and cacheline occupancy
 * information using pahole. */

struct audit_entry {      /* via audit_filter_list[AUDIT_FILTER_EXIT] */

	struct list_head     list;    /*   0   16 */
	struct callback_head rcu;     /*  16   16 */
	struct audit_krule   rule;    /*  32  376 */
	...
	/* size: 408, cachelines: 7, members: 3 */
	/* last cacheline: 24 bytes */
};

struct audit_krule {      /* inlined in struct audit_entry */
	...
	u32 mask[64];                 /*  16  256 */
	...
	/* size: 376, cachelines: 6, members: 17 */
	/* last cacheline: 56 bytes */
};

struct audit_context {
	...
	int major;                    /*  20    4 */
	...
	/* size: 920, cachelines: 15, members: 46 (slightly larger on UEK6) */
	/* sum members: 912, holes: 2, sum holes: 8 */
	/* last cacheline: 24 bytes */
};</pre>
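<p>(If you want to reproduce the layout annotations above, pahole can print them from a kernel image with debug info; the vmlinux path below is a placeholder:)</p>

<pre class="brush: bash;">pahole -C audit_entry   /path/to/vmlinux
pahole -C audit_context /path/to/vmlinux</pre>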
<p>The effective execution loop in <code>audit_filter_syscall()</code> (with cacheline access annotations):</p>

<pre class="brush: bash;">struct audit_entry *e = &audit_filter_list[AUDIT_FILTER_EXIT];

for_each_iteration {
	e = e->next;                     /* cacheline-0 of audit_entry */
	if (e == list)
		jmp out;
	if (audit_in_mask(e->rule.mask,  /* cacheline-0 of audit_entry */
			  ctx->major))   /* cacheline-0 of audit_context */
		audit_filter_rules(e->rule);
}
out:</pre>

<p>As the annotations above mention, there are a total of three loads:</p>

<ol type="1">
<li>Pointer chasing in <code>e->next</code>: the first cacheline of <code>struct audit_entry</code>.</li>
<li><code>e->rule.mask[]</code>: accesses the same cacheline as load (1) above.</li>
<li><code>ctx->major</code>: accesses the first cacheline of <code>struct audit_context</code>.</li>
</ol>

<p>Loads (1) and (2) will access a total of 37 cachelines, corresponding to one rule per iteration. Also notice that every single basic block in the rest of the iteration (apart from some error checking in <code>audit_in_mask()</code>) has a data dependence on the evaluation of <code>e = e->next</code>. Worse, this is a loop-carried dependency, so each iteration depends on the previous one.</p>

<p>The cacheline for load (3) is accessed once every iteration. This load is unnecessary: <code>ctx->major</code> contains the syscall number, which is a constant for the duration of the syscall. However, because the compiler's alias analysis cannot prove that <code>ctx->major</code> is not modified, it does not get cached in a register. This also means that <code>audit_in_mask()</code> will do the out-of-bounds validation checks related to <code>ctx->major</code> over and over.</p>

<p>Recalling the <code>perf stat -d</code> output above, there are a total of around 400 L1d-loads for each <code>getpid()</code> call. Of those, the loop does a total of 37*3 loads, which map to a total of 38 unique cachelines.</p>

<p>Alright, I hear you think: granted, walking linked-lists is difficult, there are a lot of cachelines in a lot of iterations or whatever, life is hard and the compiler doesn't know what it is doing[3]. Even given all of that, nothing here has changed from UEK5 to UEK6, so none of this explains why UEK6 would incur more L1d-load-misses[4].</p>

<p>Which is true, so that's next.</p>

<h3 id="theory-of-the-case">Theory of the case</h3>

<p>From the background above, we know that the loop is pure computation, and purely local computation at that, so code changes elsewhere should have no effect. And there were no significant code changes from UEK5 to UEK6, so the loop is unchanged (which also applies to the generated assembly.)</p>

<p>Now, insofar as L1d-load-misses are concerned: the number of cachelines accessed (from about 400 L1d-loads per <code>getpid()</code> call, not all of which are to unique cachelines) amounts to a number comfortably below the Skylake-X L1d-cache capacity of 512 cachelines. So this loop should not incur any capacity misses.</p>

<p>Which leaves conflict misses as the probable cause[5]. Skylake-X has an 8-way associative L1: if more than 8 loads in the loop map to the same cache-set, some accesses would incur conflict misses.</p>

<p>Accesses in the loop and how they map to cache-sets:</p>

<ul>
<li><code>struct audit_entry</code>: aligns at a 512B boundary, which limits it to cache-sets <code>{0, 8, 16, ... 56}</code>, for a total of 8*8 cache-slots.</li>
<li><code>struct audit_context</code>: aligns at a 1024B boundary, which resolves to cache-sets <code>{0, 16, 32, 48}</code>, for a total of 4*8 cache-slots. As described earlier, this is a single cacheline which competes with a subset of the <code>struct audit_entry</code> cachelines.</li>
</ul>

<p>Even then, this is 37 cachelines slotted into 64 slots, and another one slotting into 32 of those 64. This should be easy enough to satisfy, assuming that the kernel allocator has a reasonably sane distribution and isn't skewed towards a particular set of cachelines (or is similarly skewed on both UEK5 and UEK6.)</p>

<h3 id="allocation-skew">Allocation skew</h3>

<p>If allocations for <code>struct audit_entry</code> were distributed uniformly, they would map into cache-sets uniformly, ending up with similar populations across the cache-sets. This would give a cacheline-spread metric of ~0 (obtained by calculating the standard deviation of the populations across cache-sets.)</p>

<p>What we see:</p>

<pre class="brush: bash;">cacheline-spread on UEK5: 1.58
cacheline-spread on UEK6: 1.91</pre>

<p>(These results are from a large number (> 100) of non-correlated runs. <code>auditd</code> allocates at boot, so this was done by rebooting between each run.)</p>
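<p>To make this concrete, here is a sketch of how an allocation's address resolves to an L1d cache-set, and of a spread metric computed from the per-set populations. The list of <code>audit_entry</code> addresses is an assumed input (e.g. dumped from a live kernel), and the exact normalization behind the numbers above isn't spelled out here, so treat it as illustrative:</p>

<pre class="brush: bash;">#include <math.h>

/* Skylake-X L1d: 64B cachelines, 64 sets, 8 ways. */
#define CACHELINE_SHIFT	6
#define NR_SETS		64
#define CANDIDATE_SETS	8	/* 512B alignment / 64B cachelines */

/* A 512B-aligned address has (addr >> 6) % 8 == 0, so its first cacheline
 * can only land in sets {0, 8, 16, ..., 56}. */
static unsigned int cache_set(unsigned long addr)
{
	return (addr >> CACHELINE_SHIFT) & (NR_SETS - 1);
}

/* cacheline-spread: standard deviation of the populations of the candidate
 * cache-sets, over the first cacheline of each audit_entry allocation. */
static double cacheline_spread(const unsigned long *entry_addrs, int n)
{
	int population[CANDIDATE_SETS] = { 0 };
	double mean = (double)n / CANDIDATE_SETS, var = 0;
	int i, s;

	for (i = 0; i < n; i++)
		/* entries are 512B aligned, so the set is a multiple of 8 */
		population[cache_set(entry_addrs[i]) / 8]++;

	for (s = 0; s < CANDIDATE_SETS; s++)
		var += (population[s] - mean) * (population[s] - mean);

	return sqrt(var / CANDIDATE_SETS);
}</pre>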
<p>From these numbers, UEK5 is far from a flat distribution, and UEK6 is somewhat worse, but not dispositively so. Additionally, a slight imbalance will not cause performance degradation: that happens only after cache conflicts kick in, which is after the cache-set population crosses the associativity threshold.</p>

<p>To validate this, we measure how well cycles correlate[6] with 1) L1d-misses, and 2) cacheline-spread:</p>

<table>
<tr><th>Kernel</th><th>cycles:L1d-misses</th><th>cycles:cacheline-spread</th></tr>
<tr><td>UEK5</td><td>0.74</td><td>0.22</td></tr>
<tr><td>UEK6</td><td>0.74</td><td>0.61</td></tr>
</table>

<p>For both UEK5 and UEK6, “cycles:L1d-misses” is tightly correlated (though the value of 0.74 for both is happenstance), which makes sense. “cycles:cacheline-spread”, however, is well correlated only on UEK6, not UEK5. This suggests that the UEK6 allocator skew is meaningfully worse, enough to cause lower performance.</p>
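<p>The correlation here is computed over per-boot samples; a plain Pearson estimator along the following lines would do (the exact estimator behind the table is an assumption):</p>

<pre class="brush: bash;">#include <math.h>

/* Pearson correlation coefficient of two per-run series, e.g. cycles[] vs
 * L1d-misses[] (or cacheline-spread[]) collected over n independent boots. */
static double correlate(const double *x, const double *y, int n)
{
	double sx = 0, sy = 0, sxx = 0, syy = 0, sxy = 0;
	int i;

	for (i = 0; i < n; i++) {
		sx  += x[i];
		sy  += y[i];
		sxx += x[i] * x[i];
		syy += y[i] * y[i];
		sxy += x[i] * y[i];
	}
	return (n * sxy - sx * sy) /
	       sqrt((n * sxx - sx * sx) * (n * syy - sy * sy));
}</pre>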
<p>Alright, having beaten this dead horse enough, let's figure out how to fix it next[7].</p>

<h2 id="speeding-it-up">Speeding it up</h2>

<p>To get back our lost performance, our task is simple: optimize a hot loop[8] which is itself executed in the hot syscall path. Compounding the problem, the critical load in the loop is reached via a linked list.</p>

<p>Stated like that, it sounds pretty bad. But, as we will see, the structure of the problem helps quite a bit:</p>

<ol type="1">
<li>On a sane system, the common case is extremely common: syscalls are frequent, and audit logging is unusual. This means that low branch-mispredict rates are not unusual and something we might even depend on.</li>
<li>We are optimizing a no-op loop: the loop walks a bunch of rules, does error checking, and decides if it needs to log. In the common case, it will conclude that it doesn't. (This is really (1) restated to stress the no-op nature of the loop.)</li>
</ol>

<p>A no-op loop implies that the code does not actually care about most of the values it computes. It just inches towards a foregone conclusion.</p>

<p>This it does (as all code does) by means of dependency chains that transform the input state to output. Here, most dependency chains are short and are really <em>only used to predict the control flow</em>. The only long dependency chain, woven through all the loop iterations, is the one walking the linked list.</p>

<p>Now, critically, since the branches are predicted perfectly or almost so, the control flow can run quite a bit further ahead than any loads and dependent computation. The control flow thus essentially feeds these loads and other instructions to the ROB, where they wait until resources/dependencies become available and compute the output of their chain which, to reiterate, will only be used to predict the control flow.</p>

<p>Given that the control flow is already feeding instructions from the correct direction, these are in effect orphan chains that eventually retire without anyone having cared for the output they compute or how long that took.</p>

<p>Except: this happy state continues only until we run into a resource constraint. For instance, the ROB on Skylake-X has 224 entries and each loop iteration is ~20 instructions. This means instructions worth around 10 loop iterations can be present in the ROB. Now, given that instructions retire in-order on x86, long-running instructions (L1d-load-misses of course, but also L1d-load hits[9]) with long dependence chains would slow retirement down, even were the control flow to be predicted perfectly.</p>

<p>Bearing these observations in mind, our fixes will try to reduce the amount and cost of work per loop iteration. This allows the loop to retire as close as possible to the gating latency of any long-running instructions in the loop.</p>

<h3 id="cache-ctx-major-in-audit_filter_syscall">Cache <code>ctx->major</code> in <code>audit_filter_syscall()</code></h3>

<pre class="brush: bash;">@@ -785,13 +785,14 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
 {
 	struct audit_entry *e;
 	enum audit_state state;
+	unsigned long major = ctx->major;

 	if (auditd_test_task(tsk))
 		return AUDIT_DISABLED;

 	rcu_read_lock();
 	list_for_each_entry_rcu(e, list, list) {
-		if (audit_in_mask(&e->rule, ctx->major) &&
+		if (audit_in_mask(&e->rule, major) &&
 		    audit_filter_rules(tsk, &e->rule, ctx, NULL,
 				       &state, false)) {
 			rcu_read_unlock();</pre>

<p>Caching <code>ctx->major</code> in a local variable helps in two ways:</p>

<ul>
<li>It explicitly indicates to the compiler that there are no stores to the cached value. <code>audit_in_mask()</code> operates on <code>ctx->major</code>, doing some bit-shifting and error checking. Now that the compiler knows that <code>major</code> is not modified, it can hoist most of that logic out of the loop so it is not reevaluated over and over in every loop iteration (see the sketch below.)</li>
<li>As described earlier, <code>struct audit_context</code> has similar natural alignment concerns as <code>struct audit_entry</code>. Allowing the compiler to cache <code>ctx->major</code> in a register (or on the stack) removes one potential source of contention.</li>
</ul>
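<p>Conceptually, with <code>major</code> in a local, the loop behaves as if it were written as below. This is only an illustration of the loop-invariant hoisting the compiler can now do, not the actual generated code:</p>

<pre class="brush: bash;">/* Illustration only: the word/bit computation and the range checks from
 * audit_in_mask() become loop-invariant and can be hoisted, leaving just the
 * mask test inside the loop. */
unsigned long major = ctx->major;
u32  word  = AUDIT_WORD(major);
u32  bit   = AUDIT_BIT(major);
bool valid = (major <= 0xffffffff) && (word < AUDIT_BITMASK_SIZE);

list_for_each_entry_rcu(e, list, list) {
	if (valid && (e->rule.mask[word] & bit) &&
	    audit_filter_rules(tsk, &e->rule, ctx, NULL, &state, false)) {
		...
	}
}</pre>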
<p>With this change, the number of instructions executed per loop iteration drops by 8 (of 20.) Note that most of those were almost-free ALU instructions.</p>

<p>L1d-loads: we removed one L1d-load but added two (due to the compiler now spilling and reloading some state to/from the stack.) However, given that stack accesses are much less likely to have conflicting alignment constraints, the added loads are less of a concern than the one we got rid of.</p>

<p>cycles: improve by about 40 cycles. This is because the greater room in the ROB allows our almost-perfect branch prediction to speculatively run even further ahead of other instructions.</p>

<p>Change in latency for UEK6:</p>

<table>
<tr><th>Version</th><th>Min (ns)</th><th>Mean (ns)</th><th>Median (ns)</th><th>Max (ns)</th></tr>
<tr><td>baseline</td><td>196.26</td><td>212.00</td><td>207.80</td><td>240.52</td></tr>
<tr><td>ctx->major</td><td>183.50</td><td>201.41</td><td>198.80</td><td>226.93</td></tr>
</table>

<p>From the min-max range, there is a rather large variation in latency, caused by variations in allocation that result in high or low cacheline-spread. In almost all cases though, the latency improves by ~10ns or thereabouts.</p>

<p>That said, after removing 8 instructions and one load (and adding two less consequential loads), the performance gain is rather minuscule: ~1 cycle/iteration. Just that the loop executes 37 times, so we make it up in volume.</p>

<p>More details (<code>perf stat</code> and the before/after versions of the generated code) in <a href="https://github.com/oracle/linux-uek/commit/87a39a3d2ca9a5c7e4d35e4cf4b839c53cc0678d">UEK6 commit-1</a> and in <a href="https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/kernel/auditsc.c?id=069545997510833281f45f83e097017b9fef19b7">Upstream commit-1</a>.</p>

<h3 id="annotate-branch-direction-for-audit_in_mask">Annotate branch direction for <code>audit_in_mask()</code></h3>

<pre class="brush: bash;">@@ -790,12 +790,13 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
 	rcu_read_lock();
 	list_for_each_entry_rcu(e, list, list) {
-		if (audit_in_mask(&e->rule, major) &&
-		    audit_filter_rules(tsk, &e->rule, ctx, NULL,
-				       &state, false)) {
...
+		if (unlikely(audit_in_mask(&e->rule, major))) {
+			if (audit_filter_rules(tsk, &e->rule, ctx, NULL,
+					       &state, false)) {</pre>

<p>Annotating <code>audit_in_mask()</code> as <code>unlikely()</code> allows the compiler to pessimize the call to <code>audit_filter_rules()</code>. Two reasons for this change:</p>

<ul>
<li>The primary motivation was to get rid of the extra branch mispredict. This change succeeds in that task, but it is unclear why: there's no significant change in the basic-block structure. The only change is a branch inversion due to the unlikely clause.</li>
<li>The branch inversion means that the not-taken direction is chosen more often: 32/37 times (up from 5/37 earlier.) The issue latency for not-taken branches (0.5-1 cycles) is slightly cheaper than for taken branches (1-2 cycles)[10].</li>
</ul>
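<p>For reference, <code>unlikely()</code> is just a static branch-prediction hint; in the kernel it boils down to <code>__builtin_expect()</code> (paraphrased from <code>include/linux/compiler.h</code>):</p>

<pre class="brush: bash;">#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)</pre>

<p>With the hint, the compiler can lay the <code>audit_filter_rules()</code> path out of line, so the common path through the loop falls through as a not-taken branch.</p>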
<p>L1d-loads: we removed one L1d-load but added two (due to the compiler now spilling and reloading some state to/from the stack). However, given that stack accesses are much less likely to have conflicting alignment constraints, the increased loads are less of a concern than the one we got rid of.</p>

<p>cycles: improve by about 40 cycles. This is because the greater room in the ROB allows our almost-perfect branch prediction to speculatively run even further ahead of other instructions.</p>

<p>Change in latency for UEK6:</p>

<div class="divTable">
<div class="divTableHeading">
<div class="divTableCell">Version</div>

<div class="divTableCell">Min<br>
(ns)</div>

<div class="divTableCell">Mean<br>
(ns)</div>

<div class="divTableCell">Median<br>
(ns)</div>

<div class="divTableCell">Max<br>
(ns)</div>
</div>

<div class="divTableRow">
<div class="divTableCell">baseline</div>

<div class="divTableCell">196.26</div>

<div class="divTableCell">212.00</div>

<div class="divTableCell">207.80</div>

<div class="divTableCell">240.52</div>
</div>

<div class="divTableRow">
<div class="divTableCell">ctx->major</div>

<div class="divTableCell">183.50</div>

<div class="divTableCell">201.41</div>

<div class="divTableCell">198.80</div>

<div class="divTableCell">226.93</div>
</div>
</div>

<p> </p>

<p>From the min-max range, there is a rather large variation in latency that’s caused by variations in allocation resulting in high or low cacheline-spread. In almost all cases though, the latency improves by ~10ns or thereabouts.</p>

<p>That said, after removing 8 instructions and one load (and adding two less consequential loads), the performance gain is rather minuscule: ~1 cycle/iteration. Just that the loop executes 37 times, so we make it up in volume.</p>

<p>More details (<code style="background:#eeeeee;border:1px solid #cccccc;">perf-stat</code> and the before/after versions of the generated code) in <a href="https://github.com/oracle/linux-uek/commit/87a39a3d2ca9a5c7e4d35e4cf4b839c53cc0678d">UEK6 commit-1</a> and in <a href="https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/kernel/auditsc.c?id=069545997510833281f45f83e097017b9fef19b7">Upstream commit-1</a>.</p>

<h3 id="annotate-branch-direction-for-audit_in_mask">Annotate branch direction for <code style="background:#eeeeee;border:1px solid #cccccc;">audit_in_mask()</code></h3>

<pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">@@ -790,12 +790,13 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
rcu_read_lock();
list_for_each_entry_rcu(e, list, list) {
- if (audit_in_mask(&e->rule, major) &&
- audit_filter_rules(tsk, &e->rule, ctx, NULL,
- &state, false)) {
...
+ if (unlikely(audit_in_mask(&e->rule, major))) {
+ if (audit_filter_rules(tsk, &e->rule, ctx, NULL,
+ &state, false)) {</pre>

<p>Annotate <code style="background:#eeeeee;border:1px solid #cccccc;">audit_in_mask()</code> as <code style="background:#eeeeee;border:1px solid #cccccc;">unlikely()</code> to allow the compiler to pessimize the call to <code style="background:#eeeeee;border:1px solid #cccccc;">audit_filter_rules()</code>. Two reasons for this change (a short sketch of the annotation's effect follows the list):</p>

<ul>
<li>The primary motivation was to get rid of the extra branch mispred. This change succeeds at that, but it is unclear why: there’s no significant change in the basic-block structure. The only change is from a branch inversion due to the unlikely clause.</li>
<li>The branch inversion means that the not-taken direction is chosen more often: 32/37 times (up from 5/37 earlier). The issue-latency for not-taken branches is 0.5-1 cycles versus 1-2 cycles for taken branches[10], so the new arrangement is slightly cheaper.</li>
</ul>
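<p>For illustration, a minimal sketch of what the annotation does. The kernel's <code style="background:#eeeeee;border:1px solid #cccccc;">unlikely()</code> boils down to <code style="background:#eeeeee;border:1px solid #cccccc;">__builtin_expect()</code>; the function below is a stand-in, not the audit code:</p>

<pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">#define unlikely(x) __builtin_expect(!!(x), 0)   /* same idea as the kernel macro */

int filter_one(int match, int (*slow_path)(void))
{
        /*
         * The hint lets the compiler lay out the common case as the
         * straight-line fall-through (a not-taken branch) and push the
         * rarely executed call, along with its spills, reloads and call
         * overhead, out of the hot path.
         */
        if (unlikely(match))
                return slow_path();
        return 0;
}</pre>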
<p>L1d-loads: reduce by 2 for each loop iteration. This is because the spills and reloads introduced in the “Cache <code style="background:#eeeeee;border:1px solid #cccccc;">ctx->major</code>…” patch have now been shifted to the unlikely path (the prologue and epilogue of the <code style="background:#eeeeee;border:1px solid #cccccc;">audit_filter_rules()</code> call).</p>

<p>cycles: performance improves on average by ~30 cycles/call.</p>

<p>Change in latency for UEK6:</p>

<div class="divTable">
<div class="divTableHeading">
<div class="divTableCell">Version</div>

<div class="divTableCell">Min<br>
(ns)</div>

<div class="divTableCell">Mean<br>
(ns)</div>

<div class="divTableCell">Median<br>
(ns)</div>

<div class="divTableCell">Max<br>
(ns)</div>
</div>

<div class="divTableRow">
<div class="divTableCell">ctx->major</div>

<div class="divTableCell">183.50</div>

<div class="divTableCell">201.41</div>

<div class="divTableCell">198.80</div>

<div class="divTableCell">226.93</div>
</div>

<div class="divTableRow">
<div class="divTableCell">ctx->major+annot</div>

<div class="divTableCell">165.26</div>

<div class="divTableCell">188.72</div>

<div class="divTableCell">184.25</div>

<div class="divTableCell">230.34</div>
</div>
</div>

<p> </p>

<p>More details (<code style="background:#eeeeee;border:1px solid #cccccc;">perf-stat</code> and the before/after versions of the generated code) in <a href="https://github.com/oracle/linux-uek/commit/0288dbdbfb5768ad8ae8a445c72f523bcb99eca0">UEK6 commit-2</a>.</p>

<h3 id="remove-static-linkage-from-audit_filter_syscall">Remove static linkage from <code style="background:#eeeeee;border:1px solid #cccccc;">audit_filter_syscall()</code></h3>

<pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">@@ -777,7 +777,7 @@ static bool audit_in_mask(const struct audit_krule *rule, unsigned long
* also not high enough that we already know we have to write an audit
* record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
*/
-static enum audit_state audit_filter_syscall(struct task_struct *tsk,
+enum audit_state audit_filter_syscall(struct task_struct *tsk,
struct audit_context *ctx,
struct list_head *list)</pre>

<p><code style="background:#eeeeee;border:1px solid #cccccc;">audit_filter_syscall()</code> is only used locally in the file and so is marked <code style="background:#eeeeee;border:1px solid #cccccc;">static</code>. Additionally, it’s only ever called with a fixed <code style="background:#eeeeee;border:1px solid #cccccc;">list</code> value of <code style="background:#eeeeee;border:1px solid #cccccc;">&audit_filter_list[AUDIT_FILTER_EXIT]</code>.</p>

<p>GCC’s constant propagation pass makes use of these two things to, quite reasonably, const-propagate the third argument to the point of use.</p>

<p>This causes the exit check in the <code style="background:#eeeeee;border:1px solid #cccccc;">list_for_each</code> loop to look like this:</p>

<pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">audit_filter_syscall.constprop.18(task, ctx):
0: 48 8b 1b mov (%rbx),%rbx
3: 48 81 fb e0 67 ac 82 cmp $0xffffffff82ac67e0,%rbx
ffffffff8118b5ed: R_X86_64_32S audit_filter_list+0x40
10: 75 e2 jne start_iter</pre>

<p>while, without const-propagation, it would have looked like this:</p>

<pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">audit_filter_syscall(task, ctx, list):
0: 48 8b 1b mov (%rbx),%rbx
3: 4c 39 e3 cmp %r12,%rbx
6: 75 e6 jne start_iter</pre>

<p>Now, either one ought to be alright: both the <code style="background:#eeeeee;border:1px solid #cccccc;">cmp imm32,r</code> and <code style="background:#eeeeee;border:1px solid #cccccc;">cmp r,r</code> forms have a latency of 1 cycle, and both are a single micro-op each.</p>

<p>The second form of the <code style="background:#eeeeee;border:1px solid #cccccc;">cmp</code>, however, can be macro-op fused with the <code style="background:#eeeeee;border:1px solid #cccccc;">jne</code>; I'm not entirely sure if the first form can be[11]. The second form is also denser, though that’s not a concern here.</p>
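<p>To see the two forms side by side, here is a small userspace sketch (hypothetical names; compile with something like <code style="background:#eeeeee;border:1px solid #cccccc;">gcc -O2 -S</code> and compare the generated assembly with and without <code style="background:#eeeeee;border:1px solid #cccccc;">static</code>; whether GCC actually emits a constprop clone depends on the optimizer):</p>

<pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">struct node { struct node *next; };

/*
 * With 'static' (and a constant argument at every call site), GCC's IPA
 * constant propagation may emit a walk.constprop.0 clone in which 'stop'
 * has been folded into an immediate, giving a cmp $imm32,%reg exit check.
 * Dropping 'static' forces 'stop' to arrive in a register, so the exit
 * check becomes the reg-reg cmp that can macro-fuse with the jne.
 */
static __attribute__((noinline))
int walk(struct node *head, const struct node *stop)
{
        int n = 0;

        for (struct node *p = head; p != stop; p = p->next)
                n++;
        return n;
}

static struct node sentinel = { .next = &sentinel };

int count_all(void)
{
        /* fixed second argument, like &audit_filter_list[AUDIT_FILTER_EXIT] */
        return walk(sentinel.next, &sentinel);
}</pre>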
<p>Disallowing GCC from making assumptions about calling contexts by removing the <code style="background:#eeeeee;border:1px solid #cccccc;">static</code> linkage from <code style="background:#eeeeee;border:1px solid #cccccc;">audit_filter_syscall()</code> forces it to pass the <code style="background:#eeeeee;border:1px solid #cccccc;">list</code> parameter in a register, which results in a small performance improvement: ~20 cycles (about 0.5 cycles per loop iteration).</p>

<p>Change in latency for UEK6:</p>

<div class="divTable">
<div class="divTableHeading">
<div class="divTableCell">Version</div>

<div class="divTableCell">Min<br>
(ns)</div>

<div class="divTableCell">Mean<br>
(ns)</div>

<div class="divTableCell">Median<br>
(ns)</div>

<div class="divTableCell">Max<br>
(ns)</div>
</div>

<div class="divTableRow">
<div class="divTableCell">ctx->major+annot</div>

<div class="divTableCell">165.26</div>

<div class="divTableCell">188.72</div>

<div class="divTableCell">184.25</div>

<div class="divTableCell">230.34</div>
</div>

<div class="divTableRow">
<div class="divTableCell">ctx->major+annot+extern</div>

<div class="divTableCell">159.88</div>

<div class="divTableCell">184.35</div>

<div class="divTableCell">177.62</div>

<div class="divTableCell">250.82</div>
</div>
</div>

<p> </p>

<p>More details (<code style="background:#eeeeee;border:1px solid #cccccc;">perf-stat</code> and the before/after versions of the generated code) in <a href="https://github.com/oracle/linux-uek/commit/5a74015e20bff63d1052359fbc2c3418e0f6bc4e">UEK6 commit-3</a> and in <a href="https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/kernel/auditsc.c?id=50979953c0c41e929e5f955800da68e1bb24c7ab">Upstream commit-3</a>.</p>

<h2 id="summary">Summary</h2>

<p>The audit subsystem is fairly stable in the Linux kernel, not given to frequent changes.
So it was puzzling when it became slower in recent kernels, and, because a primary user is the syscall path, concerning[12].</p>

<p>The cause turned out to be higher skew in the allocated buffers, which results in a more lopsided cache-set distribution.</p>

<p>The fixes compensate for the higher costs in the loop by taking advantage of the peculiarities of the execution path and optimizing for the speculative nature of the CPU pipeline.</p>

<p>The three patches, in sum, reduce the overhead by about 30ns (~100 cycles).</p>

<p>The final <code style="background:#eeeeee;border:1px solid #cccccc;">perf stat -d -r 5</code> numbers go from:</p>

<pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;"># perf stat -d -r 5 ./getpid
# output normalized for a single getpid() call

cycles 761.65 ( +- 5.22% )
instructions 1639.17 ( +- 0.00% )
IPC 2.18 ( +- 5.50% )
branches 328.21 ( +- 0.00% )
branch-misses 1.37 ( +- 6.56% )
L1-dcache-loads 404.35 ( +- 0.00% )
L1-dcache-load-misses 7.99 ( +- 70.71% )</pre>

<p>to:</p>

<pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;"># perf stat -d -r 5 ./getpid
# output normalized for a single getpid() call

cycles 669.09 ( +- 11.23% )
instructions 1342.04 ( +- 0.00% )
IPC 2.03 ( +- 9.85% )
branches 328.19 ( +- 0.00% )
branch-misses 0.56 ( +- 5.35% )
L1-dcache-loads 384.31 ( +- 0.00% )
L1-dcache-load-misses 5.77 ( +- 84.57% )</pre>

<p>This compares quite well to the UEK5-baseline:</p>

<pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;"># perf stat -d -r 5 ./getpid
# output normalized for a single getpid() call

cycles 672.90 ( +- 1.65% )
instructions 1622.08 ( +- 0.00% )
IPC 2.41 ( +- 1.65% )
branches 321.20 ( +- 0.00% )
branch-misses 0.51 ( +- 0.00% )
L1-dcache-loads 401.32 ( +- 0.00% )
L1-dcache-load-misses 2.28 ( +- 59.62% )</pre>

<p>Note for non-Skylake-X architectures: Intel Icelake and AMD Milan (the other architectures tested) cope with L1d-load-misses much better, so their baseline performance is much better.</p>

<p>With these patches, they only show a small improvement (~10ns): Icelake has a bigger L1d-cache (48K), and a much bigger ROB.
Milan also has a bigger ROB and does memory renaming and a bunch of other pipeline optimizations that limit the effect of these changes.</p>

<p><strong>Endnote:</strong> what I found personally instructive was how much C really is “a portable assembler” and the significant codegen (and performance) changes that can result from minimal changes to the code.</p>

<h2 id="references">References</h2>

<ol type="1">
<li>
<p><code style="background:#eeeeee;border:1px solid #cccccc;">getpid()</code> has a minimal kernel execution path (only does a PID lookup), and so is generally used to measure the overhead of the syscall path.</p>
</li>
<li>
<p>Comparing the IPC for the audit-only portion shows a starker drop:</p>

<pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">UEK5: 1427.0 instructions # 3.41 insn per cycle
UEK6: 1432.0 instructions # 2.84 insn per cycle</pre>
</li>
<li>
<p>Alas no, alias analysis is an undecidable problem.</p>
</li>
<li>
<p>Or for that matter, what causes the extra branch-miss.</p>
</li>
<li>
<p>Another possibility is out-of-line code -- frequent interrupts, vmexits etc. -- thrashing the cache, but from profiling these were a non-issue.</p>
</li>
<li>
<p>Measured using pearson-quotient(x, y): the correlation coefficient between quantities x and y.</p>
</li>
<li>
<p>You might notice that this analysis does not address the extra branch-miss. That's because I still have no clue what causes it.</p>
</li>
<li>
<p>The correct fix would be to fix whatever ails the allocator. However, from a quick look at the changes that have gone into related code, it seems non-trivial to find a particular commit which points to the root cause of the skew (especially given that the skew is not constant, but varies from run-to-run). Also, notably, the fixes described below also apply to UEK5, which means that even if UEK6 becomes faster, UEK5 will also improve somewhat.</p>
</li>
<li>
<p>As mentioned in <a href="#cpu-parameters">CPU-parameters</a>, L1d-loads take 4-6 cycles on Skylake-X. We also know that in the good case (UEK5), this loop is capable of an IPC of 3.41 insn per cycle.
So, hiding L1d-load latency is critical for good performance.</p>
</li>
<li>
<p><a href="https://www.agner.org/optimize/instruction_tables.pdf">https://www.agner.org/optimize/instruction_tables.pdf</a>, pg 298 (Skylake-X)</p>
</li>
<li>
<p>The first form, if fused, needs three inputs: <code style="background:#eeeeee;border:1px solid #cccccc;">%rbx</code>, an <code style="background:#eeeeee;border:1px solid #cccccc;">imm32</code> encoding the distance to the address being compared, and an <code style="background:#eeeeee;border:1px solid #cccccc;">imm8</code> encoding the distance to the branch-dest; the second needs two registers, <code style="background:#eeeeee;border:1px solid #cccccc;">%rbx</code> and <code style="background:#eeeeee;border:1px solid #cccccc;">%r12</code>, and only the <code style="background:#eeeeee;border:1px solid #cccccc;">imm8</code>.</p>
</li>
<li>
<p>Just for context, a kernel build (x86-defconfig) makes an aggregate of 27M syscalls, with a syscall every 44us.</p>
</li>
</ol>


</section>
<!-- /RC84v1 -->

<!-- RC84v2 -->
<section class="rc84v2 cpad">
<div class="rc84w1 cwidth">

<div class="rc84bio">
<div class="rc84img">
<img src="../_cache_8b25/_themesdelivery/Blogs-New-Theme/assets/img/ui_defaultuserimage.jpg" alt="">
</div>
<div class="rc84blurb">
<div class="blogtile-w2-inner text-wrap">
<h4>Ankur Arora</h4>

<p></p>
</div>
</div>
</div>

</div>
</section>
<!-- /RC84v2 -->


</div>

</div>
</section>
<!-- /RC84v0 -->

<!-- /RC83v0 -->
class="u10-links u10w10"> 1446 <li><a href="https://www.oracle.com/legal/copyright.html" data-lbl="copyright">© 2022 Oracle</a></li> 1447 1448 <li><a data-lbl="privacy" 1449 href="https://www.oracle.com/legal/privacy/">Privacy</a><span>/</span><a 1450 data-lbl="do-not-sell-my-info" 1451 href="https://www.oracle.com/legal/privacy/privacy-choices.html">Do Not Sell My Info</a> 1452 </li> 1453 <li> 1454 <div id='teconsent'> </div> 1455 </li> 1456 <li><a href="https://www.oracle.com/legal/privacy/privacy-policy.html#advertising" 1457 data-lbl="ad-choices">Ad Choices</a></li> 1458 <li><a href="https://www.oracle.com/corporate/careers/" data-lbl="careers">Careers</a></li> 1459 </ul> 1460 1461 </div> 1462 </div> 1463 1464 </div> 1465 <!-- /U10v6 --> 1466 1467 1468 </div> 1469 <script type="text/javascript" src="https://www.oracle.com/us/assets/metrics/ora_compendiumblogs.js"></script> 1470 <script type="text/javascript" src="https://www.oracle.com/assets/truste-oraclelib.js"></script> 1471 <script async="async" type="text/javascript" 1472 src="//consent.trustarc.com/notice?domain=oracle.com&c=teconsent&js=bb¬iceType=bb&text=true>m=1&cdn=1&pcookie" 1473 crossorigin=""></script> 1474 <script type="text/javascript" src="../_cache_8b25/siteinfo-common.js" charset="utf-8"></script><script type="text/javascript" src="../siteinfo-dynamic.js"></script> 1475 <script src="../_cache_8b25/_sitesclouddelivery/renderer/renderer.js"></script> 1476 1477 1478 1479 1480 <script src="../_cache_8b25/_themesdelivery/Blogs-New-Theme/assets/js/blogs-script.js"></script> 1481 <script src="../_cache_8b25/_themesdelivery/Blogs-New-Theme/assets/js/oracle-script.js"></script> 1482 1483 <!-- Avoid FOUC issue in FF with async loading of style sheets --> 1484 <style> 1485 body { 1486 opacity: 1; 1487 } 1488 </style> 1489 1490 <script type="text/javascript"> 1491 $(document).ready(function () { 1492 $('a[data-lbl="copyright"]').html("© " + new Date().getFullYear() +" Oracle " ); 1493 1494 }); 1495 </script> 1496 <!--DTM/Launch embed code - Footer --> 1497 1498 </body> 1499 1500 </html>