blogs.oracle.com.html - webdump_tests - Testfiles for webdump
(HTM) git clone git://git.codemadness.org/webdump_tests
---
blogs.oracle.com.html (134476B)
---
1 <!DOCTYPE html>
2 <html lang="en-US" class="no-js">
3
4 <head>
11 <title>Syscall latency... and some uses of speculative execution</title>
12 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
13 <meta charset="utf-8">
14 <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
</head>
136
137 <body class="f20 f20v1">
152 <div class="f20w1">
153
154
503 <!-- RC81v1 -->
504
505 <section class="rc81 rc81v1 cpad">
506
507 <div class="rc81w1 bwidth">
508
509 <div class="rc81">
510 <ul>
511 <li class="post-categories"><a href="../category/lnx-technologies" class="rc81accent"> Technologies<span>, </span> </a></li>
512 <li class="post-categories"><a href="../category/lnx-linux-kernel-development" class="rc81accent"> Linux Kernel Development<span>, </span> </a></li>
513 </ul>
514
515 </div>
516 <p class="rc81accent" id="categories"></p>
517 <h1>Syscall latency... and some uses of speculative execution</h1>
518 <span id="publishdate">September 12, 2023 |</span><span> 23 minute read</span>
519
520 <div class="rc81sub ">
521 <img src="../_cache_8b25/_themesdelivery/Blogs-New-Theme/assets/img/ui_defaultuserimage.jpg" alt="">
522
523 <span><a id="postAuthorName" href="/authors/ankur-arora">Ankur Arora</a>
524 <div><span class="rc81title rw-neutral-200txt"></span>
525
526 </div>
527 </span></div>
528
529
535
536 </div>
537 </section>
538 <!-- /RC81v1 -->
559
560 <!-- RC84v0 -->
561 <section class="rc84v0 rc84zoom ">
562 <div class="rc84w1 bwidth">
570
571
572 <div class="rc84post">
573
574 <!-- RC84v1 -->
575 <section class="rc84 rc84v1">
576
577 <h2 id="introduction">Introduction</h2>
578
579 <p>Moving from UEK5 to UEK6 brought about an unwelcome surprise: an increase in syscall latency on some x86 systems. The root cause, as we will see, was slightly slower evaluation of audit rules, which, given that they are evaluated for every syscall, is not great.</p>
580
581 <p>In this post we start off by exploring the root cause, which turns out not to be UEK specific: it impacts upstream kernels as well. Then we detail the fixes and how they take advantage of the speculative, out-of-order nature of the CPU pipeline.</p>
582
583 <p>The changes, even though they target low-level optimizations, are quite straightforward, almost trivial.</p>
584
585 <h3 id="background">Background</h3>
586
587 <p>Execution latency of the <code style="background:#eeeeee;border:1px solid #cccccc;">getpid()</code> syscall[1] increased by about 15% (measured on an Intel Skylake-X system), from 191ns on UEK5 to 217ns on UEK6.</p>
588
589 <p>This was measured in the usual way:</p>
590
591 <pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">clock_gettime(CLOCK_MONOTONIC, &amp;start);
592 for (i = 0; i &lt; large_number; i++)
593         syscall(SYS_getpid);
594 clock_gettime(CLOCK_MONOTONIC, &amp;stop);</pre>
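<p>For reference, a self-contained version of such a micro-benchmark might look like the sketch below (the iteration count and output format are arbitrary choices, not taken from the original harness):</p>

<pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">#include &lt;stdio.h&gt;
#include &lt;time.h&gt;
#include &lt;unistd.h&gt;
#include &lt;sys/syscall.h&gt;

#define ITERATIONS 10000000UL   /* arbitrary; large enough to average out noise */

int main(void)
{
        struct timespec start, stop;
        unsigned long i;
        double ns;

        clock_gettime(CLOCK_MONOTONIC, &amp;start);
        for (i = 0; i &lt; ITERATIONS; i++)
                syscall(SYS_getpid);    /* raw syscall, as in the snippet above */
        clock_gettime(CLOCK_MONOTONIC, &amp;stop);

        ns = (stop.tv_sec - start.tv_sec) * 1e9 +
             (stop.tv_nsec - start.tv_nsec);
        printf("%.1f ns/syscall\n", ns / ITERATIONS);
        return 0;
}</pre>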
595
596 <p>A quick <code style="background:#eeeeee;border:1px solid #cccccc;">perf record</code> showed that almost all of the increased latency was in <code style="background:#eeeeee;border:1px solid #cccccc;">audit_filter_syscall()</code>, which was more expensive on UEK6.</p>
597
598 <p>Oracle Exadata, where this problem was seen, has 37 audit rules that are evaluated in the syscall path. Since audit only wants to log unusual or exceptional events, the benchmark would evaluate these rules in every iteration but never generate any output. Essentially, this is purely local computation that became slower without any material changes to the audit code or to the audit rules.</p>
599
600 <h3 id="cpu-parameters">CPU-parameters</h3>
601
602 <p>Some Intel Skylake-X parameters that we'll make use of later:</p>
603
604 <pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">L1-load-latency: 4-6 cycles
605 L2-load-latency: 14 cycles
606 L1-cache-size: 32K (512 cachelines: 64 sets, 8 ways each)
607
608 ROB size: 224 micro-ops</pre>
609
610 <p>The parameters are taken from the <a href="https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html">Intel SDM</a>.</p>
611
612 <p><strong>Note:</strong> L1/L2 refer to the respective data-cache levels, and the ROB is the Reorder Buffer, where instructions are staged for in-order retirement.</p>
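<p>For later reference: with 64-byte lines and 64 sets, a cacheline's set is determined by address bits [11:6], so an object's alignment directly constrains which cache-sets it can occupy.</p>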
613
614 <h2 id="root-cause-analysis">Root cause analysis</h2>
615
616 <p>Drilling down with <code style="background:#eeeeee;border:1px solid #cccccc;">perf stat -d</code>:</p>
617
618 <p>UEK5 (191 ns):</p>
619
620 <pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;"># perf stat -d -r 5 ./getpid
621 # output normalized for a single getpid() call
622
623 677.9 cycles # 3.542 GHz
624 1635.0 instructions # 2.40 insn per cycle
625 325.0 branches
626 0.5 branch-misses # 0.16% of all branches
627 404.0 L1-dcache-loads
628 0.4 L1-dcache-load-misses # 0.10% of all L1-dcache accesses</pre>
629
630 <p>UEK6 (217ns):</p>
631
632 <pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;"># perf stat -d -r 5 ./getpid
633 # output normalized for a single getpid() call
634
635 770.4 cycles # 3.545 GHz
636 1652.0 instructions # 2.14 insn per cycle
637 332.2 branches
638 1.5 branch-misses # 0.45% of all branches
639 407.3 L1-dcache-loads
640 8.6 L1-dcache-load-misses # 2.13% of all L1-dcache accesses</pre>
641
642 <p>Comparing the two, this is an increase of ~100 cycles, with the L1d-load and instruction counts almost identical across UEK5 and UEK6. This underscores the fact that the audit code, which forms the bulk of the instructions executed, hasn’t changed all that much.</p>
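<p>As a quick sanity check, the cycle and time deltas agree: 770.4 - 677.9 = 92.5 cycles, which at ~3.54 GHz is about 26ns, matching the directly measured 217ns - 191ns = 26ns.</p>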
643
644 <p>The IPC is commensurately lower[2]. The proximal cause seems to be the increased L1d-load-misses and the one extra branch-miss.</p>
645
646 <p>These observations were confirmed via enough non-correlated runs (with intervening reboot for each) and so are statistically significant. The L1d-load-miss numbers are somewhat variable across boot cycles, but the trend is close to what we see above.</p>
647
648 <h3 id="audit_filter_syscall"><code style="background:#eeeeee;border:1px solid #cccccc;">audit_filter_syscall()</code></h3>
649
650 <p>From <code style="background:#eeeeee;border:1px solid #cccccc;">perf record</code> we know that the bulk of the increased runtime went to <code style="background:#eeeeee;border:1px solid #cccccc;">audit_filter_syscall()</code>. The procedure itself is primarily a loop that walks the list of rules, calling <code style="background:#eeeeee;border:1px solid #cccccc;">audit_in_mask()</code> for each rule to check if it needs to be evaluated for the current syscall. For <code style="background:#eeeeee;border:1px solid #cccccc;">getpid()</code>, the answer will be <code style="background:#eeeeee;border:1px solid #cccccc;">false</code> most of the time (32 of 37 times).</p>
651
652 <pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">audit_filter_syscall(...) {
653 struct audit_entry *e;
654 struct audit_context *ctx;
655 struct list_head *list;
656 list = &audit_filter_list[AUDIT_FILTER_EXIT];
657
658 list_for_each_entry_rcu(e, list, list) {
659
660 if (audit_in_mask(&e->rule, ctx->major) &&
661 audit_filter_rules(tsk, &e->rule, ctx, NULL,
662 &state, false, x)) {
663 rcu_read_unlock();
664 ctx->current_state = state;
665 return state;
666 }
667 }
668
669
670 }
671
672 audit_in_mask(const struct audit_krule *rule, unsigned long val) {
673 if (val > 0xffffffff)
674 return false;
675
676 /*
677 * val contains the current syscall number. AUDIT_WORD does
678 * some bit shifting on it.
679 */
680 word = AUDIT_WORD(val);
681 if (word >= AUDIT_BITMASK_SIZE)
682 return false;
683
684 bit = AUDIT_BIT(val);
685
686 /*
687 * The load in rule->mask[word] depends on the audit_krule (which
688 * hangs off the current rule entry) and the syscall number.
689 */
690 return rule->mask[word] & bit;
691 }
692 audit_filter_rules(...) {
693 /*
694 * Large switch statement which we ignore for the rest of this
695 * analysis because, as we will see later, loads executed in it don't
696 * have an "interesting" alignment and so their latency should be easy
697 * enough to hide.
698 */
699 }</pre>
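<p>To make the bit-shifting in <code style="background:#eeeeee;border:1px solid #cccccc;">audit_in_mask()</code> concrete, here is a small sketch using approximations of the kernel's <code style="background:#eeeeee;border:1px solid #cccccc;">AUDIT_WORD()</code>/<code style="background:#eeeeee;border:1px solid #cccccc;">AUDIT_BIT()</code> macros (the exact definitions live in the UAPI audit header; this standalone version is for illustration only):</p>

<pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">#include &lt;stdio.h&gt;
#include &lt;sys/syscall.h&gt;

/* Approximations of AUDIT_WORD()/AUDIT_BIT(): the syscall number selects a
 * 32-bit word in rule->mask[] and a bit within that word. */
#define AUDIT_WORD(nr) ((unsigned int)((nr) / 32))
#define AUDIT_BIT(nr)  (1U &lt;&lt; ((nr) - AUDIT_WORD(nr) * 32))

int main(void)
{
        unsigned long nr = SYS_getpid;  /* 39 on x86-64 */

        printf("syscall %lu -> mask[%u] &amp; 0x%x\n",
               nr, AUDIT_WORD(nr), AUDIT_BIT(nr));
        return 0;
}</pre>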
700
701 <h3 id="memory-accesses">Memory accesses</h3>
702
703 <p>Next let’s look at the data structures accessed in the <code style="background:#eeeeee;border:1px solid #cccccc;">audit_filter_syscall()</code> loop and where the L1d-load-misses might be coming from.</p>
704
705 <pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">/* Data structure layout annotated with size and cacheline occupancy
706 * information using pahole. */
707
708 struct audit_entry { /* via audit_filter_list[AUDIT_FILTER_EXIT] */
709
710 struct list_head list; /* 0 16 */
711 struct callback_head rcu; /* 16 16 */
712 struct audit_krule rule; /* 32 376 */
713 ...
714 /* size: 408, cachelines: 7, members: 3 */
715 /* last cacheline: 24 bytes */
716 };
717
718 struct audit_krule { /* inlined in struct audit_entry */
719 ...
720 u32 mask[64]; /* 16 256 */
721 ...
722 /* size: 376, cachelines: 6, members: 17 */
723 /* last cacheline: 56 bytes */
724 };
725
726 struct audit_context {
727 ...
728 int major; /* 20 4 */
729 ...
730 /* size: 920, cachelines: 15, members: 46 (slightly larger on UEK6) */
731 /* sum members: 912, holes: 2, sum holes: 8 */
732 /* last cacheline: 24 bytes */
733 };</pre>
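<p>The size/cacheline annotations above come from pahole run against a kernel image with debugging information; a typical invocation (the vmlinux path is illustrative) looks like:</p>

<pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;"># print layout, holes and cacheline boundaries for one structure
pahole -C audit_entry vmlinux
pahole -C audit_krule vmlinux
pahole -C audit_context vmlinux</pre>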
734
735 <p>The effective execution loop in <code style="background:#eeeeee;border:1px solid #cccccc;">audit_filter_syscall()</code> (with cacheline access annotations):</p>
736
737 <pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">struct audit_entry *e = &audit_filter_list[AUDIT_FILTER_EXIT];
738
739 for_each_iteration {
740 e = e->next; /* cacheline-0 of audit_entry */
741 if (e == list)
742 jmp out;
743 if (audit_in_mask(e->rule.mask, /* cacheline-0 of audit_entry */
744 ctx->major)) /* cacheline-0 of audit_context */
745 audit_filter_rules(e->rule);
746 }
747 out:</pre>
748
749 <p>As the annotations above mention, there are a total of three loads:</p>
750
751 <ol type="1">
752 <li>Pointer chasing in <code style="background:#eeeeee;border:1px solid #cccccc;">e->next</code>: the first cacheline of <code style="background:#eeeeee;border:1px solid #cccccc;">struct audit_entry</code>.</li>
753 <li><code style="background:#eeeeee;border:1px solid #cccccc;">e->rule.mask[]</code>: accesses the same cacheline as load (1) above.</li>
754 <li><code style="background:#eeeeee;border:1px solid #cccccc;">ctx->major</code>: accesses the first cacheline of <code style="background:#eeeeee;border:1px solid #cccccc;">struct audit_context</code>.</li>
755 </ol>
756
757 <p>Loads (1) and (2) will access a total of 37 cachelines, corresponding to a rule per iteration. Also notice that every single basic block in the rest of the iteration (apart from some error checking in <code style="background:#eeeeee;border:1px solid #cccccc;">audit_in_mask()</code>) has a data dependence on the evaluation of <code style="background:#eeeeee;border:1px solid #cccccc;">e=e->next</code>. Worse, this is a loop-carried dependency, so each iteration depends on the previous one.</p>
758
759 <p>The cacheline for load (3) is accessed once every iteration. This load is unnecessary: <code style="background:#eeeeee;border:1px solid #cccccc;">ctx->major</code> contains the syscall number, which is a constant for the duration of the syscall. However, because the compiler’s alias analysis cannot prove that <code style="background:#eeeeee;border:1px solid #cccccc;">ctx->major</code> is not modified, it does not get cached in a register. This also means that <code style="background:#eeeeee;border:1px solid #cccccc;">audit_in_mask()</code> will do out-of-bounds validation checks related to <code style="background:#eeeeee;border:1px solid #cccccc;">ctx->major</code> over and over.</p>
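<p>A reduced illustration of the aliasing problem (not the kernel code; the structure and function names below are made up for the example): because the opaque call may write through <code style="background:#eeeeee;border:1px solid #cccccc;">ctx</code>, the compiler must reload <code style="background:#eeeeee;border:1px solid #cccccc;">ctx->major</code> from memory on every iteration unless it is cached in a local.</p>

<pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">struct ctx   { unsigned long major; };
struct entry { struct entry *next; unsigned int mask[2]; };

int in_mask(const unsigned int *mask, unsigned long major);
int opaque_check(struct ctx *ctx, struct entry *e);   /* may modify *ctx */

int count_matches(struct ctx *ctx, struct entry *head)
{
        struct entry *e;
        int n = 0;

        for (e = head; e; e = e->next) {
                /* ctx->major is reloaded every iteration: opaque_check()
                 * might have changed it, for all the compiler knows. */
                if (in_mask(e->mask, ctx->major) &amp;&amp; opaque_check(ctx, e))
                        n++;
        }
        return n;
}</pre>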
760
761 <p>Recalling the <code style="background:#eeeeee;border:1px solid #cccccc;">perf-stat -d</code> output above, there are a total of around 400 L1d-loads for each <code style="background:#eeeeee;border:1px solid #cccccc;">getpid()</code> call. Of those, the loop does a total of 37*3 loads, which map to a total of 38 unique cachelines.</p>
762
763 <p>Alright, I hear you think: granted, walking linked-lists is difficult, there are a lot of cachelines in a lot of iterations or whatever, life is hard and the compiler doesn’t know what it is doing[3]. Even given all of that, nothing here has changed from UEK5 to UEK6, so none of this explains why UEK6 would incur more L1d-load-misses[4].</p>
764
765 <p>Which is true, so that’s next.</p>
766
767 <h3 id="theory-of-the-case">Theory of the case</h3>
768
769 <p>From the background above, we know that the loop is pure computation, and purely local computation at that, so code changes elsewhere should have no effect. And there were no significant code changes from UEK5 to UEK6, so the loop is unchanged (which also applies to the generated assembly.)</p>
770
771 <p>Now insofar as L1d-load-misses are concerned: the number of cachelines accessed (from about 400 L1d-loads per <code style="background:#eeeeee;border:1px solid #cccccc;">getpid()</code> call, not all of which are to unique cachelines) is comfortably below the Skylake-X L1d-cache capacity of 512 cachelines. So this loop should not incur any capacity misses.</p>
772
773 <p>Which leaves conflict misses as the probable cause[5]. Skylake-X has an 8-way set-associative L1: if more than 8 loads in the loop map to the same cache-set, some accesses would incur conflict misses.</p>
774
775 <p>Accesses in the loop and how they map to cache-sets:</p>
776
777 <ul>
778 <li><code style="background:#eeeeee;border:1px solid #cccccc;">struct audit_entry</code>: aligns at a 512B boundary, which limits it to cache-sets <code style="background:#eeeeee;border:1px solid #cccccc;">{0, 8, 16, ... 56}</code>, for a total of 8*8 cache-slots.</li>
779 <li><code style="background:#eeeeee;border:1px solid #cccccc;">struct audit_context</code>: aligns at a 1024B boundary, which resolves to cache-sets <code style="background:#eeeeee;border:1px solid #cccccc;">{0, 16, 32, 48}</code>, for a total of 4*8 cache-slots. As described earlier, this is a single cacheline which competes with a subset of the <code style="background:#eeeeee;border:1px solid #cccccc;">struct audit_entry</code> cachelines.</li>
780 </ul>
781
782 <p>Even then, this is 37 cachelines slotted into 64 slots and another slotting into 32 of those 64. This should be easy enough to satisfy, assuming that the kernel allocator has a reasonably sane distribution and isn’t skewed towards a particular set of cachelines (or is similarly skewed on both UEK5 and UEK6.)</p>
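<p>To make the set mapping concrete, here is a small standalone sketch of how an address resolves to an L1d cache-set on this part (64 sets of 64-byte lines; an illustrative helper, not kernel code):</p>

<pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">#include &lt;stdio.h&gt;

/* Skylake-X L1d: 32K, 64-byte lines, 8 ways => 64 sets. */
#define L1D_LINE_SHIFT 6
#define L1D_NR_SETS    64

/* The cache-set index is simply address bits [11:6]. */
static unsigned int l1d_set(unsigned long addr)
{
        return (addr >> L1D_LINE_SHIFT) &amp; (L1D_NR_SETS - 1);
}

int main(void)
{
        unsigned long addr;

        /* 512B-aligned audit_entry allocations can only land on sets 0, 8, ..., 56. */
        for (addr = 0; addr &lt; 4096; addr += 512)
                printf("512B-aligned %#6lx -> set %2u\n", addr, l1d_set(addr));

        /* 1024B-aligned audit_context allocations are limited to sets 0, 16, 32, 48. */
        for (addr = 0; addr &lt; 4096; addr += 1024)
                printf("1KB-aligned  %#6lx -> set %2u\n", addr, l1d_set(addr));
        return 0;
}</pre>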
783
784 <h3 id="allocation-skew">Allocation skew</h3>
785
786 <p>If allocations for <code style="background:#eeeeee;border:1px solid #cccccc;">struct audit_entry</code> were distributed uniformly, they would map into cache-sets uniformly, ending up with similar populations across the cache-sets. This would give a cacheline-spread metric of ~0 (obtained by calculating the standard deviation of populations across cache-sets).</p>
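<p>A sketch of that metric, under the assumption that it is the plain (population) standard deviation of per-set counts; the counts themselves would be built from the allocated objects' addresses, as in the earlier set-index helper:</p>

<pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">#include &lt;math.h&gt;

#define L1D_NR_SETS 64

/* counts[i]: how many of the audit-related cachelines map to cache-set i. */
static double cacheline_spread(const unsigned int counts[L1D_NR_SETS])
{
        double mean = 0.0, var = 0.0;
        int i;

        for (i = 0; i &lt; L1D_NR_SETS; i++)
                mean += counts[i];
        mean /= L1D_NR_SETS;

        for (i = 0; i &lt; L1D_NR_SETS; i++)
                var += (counts[i] - mean) * (counts[i] - mean);

        return sqrt(var / L1D_NR_SETS);         /* ~0 for a flat distribution */
}</pre>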
787
788 <p>What we see:</p>
789
790 <pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">cacheline-spread on UEK5: 1.58
791 cacheline-spread on UEK6: 1.91</pre>
792
793 <p>(These results are from a large number (> 100) of non-correlated runs. <code style="background:#eeeeee;border:1px solid #cccccc;">auditd</code> allocates at boot, so this was done by rebooting between each run.)</p>
794
795 <p>From these numbers, UEK5 is far from a flat distribution, and UEK6 is somewhat worse, but not dispositively so. Additionally, a slight imbalance will not cause performance degradation: that happens only after cache conflicts kick in, which is after cache-set population crosses the associativity threshold.</p>
796
797 <p>To validate this, we measure how well cycles correlate[6] with 1) L1d-misses, and 2) cacheline-spread:</p>
798 <style type="text/css">.divTable {
799 display: table;
800 width: 80%;
801 }
802 .divTableRow {
803 display: table-row;
804 }
805 .divTableHeading {
806 display: table-header-group;
807 background-color: #ddd;
808 font-weight: bold;
809 }
810 .divTableCell {
811 display: table-cell;
812 padding: 3px 10px;
813 border: 1px solid #999999;
814 }
815 </style>
816 <p> </p>
817
818 <div class="divTable">
819 <div class="divTableHeading">
820 <div class="divTableCell">Kernel</div>
821
822 <div class="divTableCell">cycles:L1d-misses</div>
823
824 <div class="divTableCell">cycles:cacheline-spread</div>
825 </div>
826
827 <div class="divTableRow">
828 <div class="divTableCell">UEK5</div>
829
830 <div class="divTableCell">0.74</div>
831
832 <div class="divTableCell">0.22</div>
833 </div>
834
835 <div class="divTableRow">
836 <div class="divTableCell">UEK6</div>
837
838 <div class="divTableCell">0.74</div>
839
840 <div class="divTableCell">0.61</div>
841 </div>
842 </div>
843
844 <p> </p>
845
846 <p>For both UEK5 and UEK6, “cycles:L1d-misses” is tightly correlated (though the value of 0.74 for both is happenstance), which makes sense. “cycles:cacheline-spread”, however, is well correlated only on UEK6, not UEK5. This suggests that the UEK6 allocator skew is meaningfully worse, enough to cause lower performance.</p>
847
848 <p>Alright, having beaten this dead horse enough, let’s figure out how to fix it next[7].</p>
849
850 <h2 id="speeding-it-up">Speeding it up</h2>
851
852 <p>To get back our lost performance, our task is simple: optimize a hot-loop[8] which is itself executed in the hot syscall path. Compounding the problem, the critical load in the loop is accessed via a linked list.</p>
853
854 <p>Stated like that, it sounds pretty bad. But, as we will see, the structure of the problem helps quite a bit:</p>
855
856 <ol type="1">
857 <li>On a sane system, the common case is extremely common: syscalls are frequent, and audit logging is unusual. This means that low branch-misprediction rates are not unusual, and something we might even depend on.</li>
858 <li>We are optimizing a no-op loop: the loop walks a bunch of rules, does error checking, and decides if it needs to log. In the common-case, it will conclude that it doesn’t. (This is really (1) restated to stress the no-op nature of the loop.)</li>
859 </ol>
860
861 <p>A no-op loop implies that the code does not actually care about most of the values it computes. It just inches towards a foregone conclusion.</p>
862
863 <p>This it does (as all code does) by means of dependency chains that transform the input state to output. Here, most dependency chains are short and are really <em>only used to predict the control flow</em>. The only long dependency chain, woven through all the loop iterations, is the one walking the linked-list.</p>
864
865 <p>Now, critically, since the branches are predicted perfectly or almost so, the control flow can run quite a bit further ahead of any loads and dependent computation. The control flow thus essentially feeds these loads and other instructions to the ROB, where they wait until resources/dependencies become available and compute the output from their chain which, to reiterate, will only be used to predict the control flow.</p>
866
867 <p>Given that the control flow is already feeding instructions from the correct direction, these are in effect orphan chains that eventually retire without anyone having cared for the output they compute or how long that took.</p>
868
869 <p>Except: this happy state continues only until we run into a resource constraint. For instance, the size of the ROB on Skylake-X is 224 entries and each loop iteration is ~20 instructions. This means instructions worth around 10 loop iterations can be present in the ROB. Now, given that instructions retire in-order on x86, long-running instructions (L1d-load-misses of course, but also L1d-load hits[9]) with long dependence chains would slow retirement down, even were the control flow to be predicted perfectly.</p>
870
871 <p>Bearing these observations in mind, our fixes will try to reduce the amount and cost of work per loop iteration. This allows the loop to retire as close as possible to the gating latency of any long-running instructions in the loop.</p>
872
873 <h3 id="cache-ctx-major-in-audit_filter_syscall">Cache <code style="background:#eeeeee;border:1px solid #cccccc;">ctx->major</code> in <code style="background:#eeeeee;border:1px solid #cccccc;">audit_filter_syscall()</code></h3>
874
875 <pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">@@ -785,13 +785,14 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
876 {
877 struct audit_entry *e;
878 enum audit_state state;
879 + unsigned long major = ctx->major;
880
881 if (auditd_test_task(tsk))
882 return AUDIT_DISABLED;
883
884 rcu_read_lock();
885 list_for_each_entry_rcu(e, list, list) {
886 - if (audit_in_mask(&e->rule, ctx->major) &&
887 + if (audit_in_mask(&e->rule, major) &&
888 audit_filter_rules(tsk, &e->rule, ctx, NULL,
889 &state, false)) {
890 rcu_read_unlock();</pre>
891
892 <p>Caching <code style="background:#eeeeee;border:1px solid #cccccc;">ctx->major</code> in a local variable helps in two ways:</p>
893
894 <ul>
895 <li>Explicitly indicates to the compiler that there are no stores to the cached value. <code style="background:#eeeeee;border:1px solid #cccccc;">audit_in_mask()</code> operates on <code style="background:#eeeeee;border:1px solid #cccccc;">ctx->major</code> doing some bit-shifting and error checking. Now that the compiler knows that <code style="background:#eeeeee;border:1px solid #cccccc;">major</code> is not modified, it can hoist most of that logic out of the loop so it is not reevaluated over and over in every loop iteration.</li>
896 <li>As described earlier, <code style="background:#eeeeee;border:1px solid #cccccc;">struct audit_context</code> has similar natural alignment concerns as <code style="background:#eeeeee;border:1px solid #cccccc;">struct audit_entry</code>. Allowing the compiler to cache <code style="background:#eeeeee;border:1px solid #cccccc;">ctx->major</code> in a register (or on the stack) reduces one potential source of contention.</li>
897 </ul>
898
899 <p>With this change the number of instructions executed per loop iteration reduces by 8 (of 20). Note that most of those were almost-free ALU instructions.</p>
900
901 <p>L1d-loads: we removed one L1d-load but added two (due to the compiler now spilling and reloading some state to/from the stack). However, given that stack accesses are much less likely to have conflicting alignment constraints, the increased loads are less of a concern than the one we got rid of.</p>
902
903 <p>cycles: improve by about 40 cycles. This is because the greater room in the ROB allows our almost perfect branch prediction to speculatively run even further ahead of other instructions.</p>
904
905 <p>Change in latency for UEK6:</p>
906
907 <div class="divTable">
908 <div class="divTableHeading">
909 <div class="divTableCell">Version</div>
910
911 <div class="divTableCell">Min<br>
912 (ns)</div>
913
914 <div class="divTableCell">Mean<br>
915 (ns)</div>
916
917 <div class="divTableCell">Median<br>
918 (ns)</div>
919
920 <div class="divTableCell">Max<br>
921 (ns)</div>
922 </div>
923
924 <div class="divTableRow">
925 <div class="divTableCell">baseline</div>
926
927 <div class="divTableCell">196.26</div>
928
929 <div class="divTableCell">212.00</div>
930
931 <div class="divTableCell">207.80</div>
932
933 <div class="divTableCell">240.52</div>
934 </div>
935
936 <div class="divTableRow">
937 <div class="divTableCell">ctx->major</div>
938
939 <div class="divTableCell">183.50</div>
940
941 <div class="divTableCell">201.41</div>
942
943 <div class="divTableCell">198.80</div>
944
945 <div class="divTableCell">226.93</div>
946 </div>
947 </div>
948
949 <p> </p>
950
951 <p>From the min-max range, there is a rather large variation in latency, caused by run-to-run variations in allocation that result in a higher or lower cacheline-spread. In almost all cases, though, the latency improves by roughly 10ns.</p>
952
953 <p>That said, after removing 8 instructions and one load (and adding two less consequential loads), the performance gain is rather minuscule: ~1 cycle/iteration. But the loop executes 37 times, so we make it up in volume.</p>
954
955 <p>More details (<code style="background:#eeeeee;border:1px solic #cccccc;">perf-stat</code> and the before/after versions of the generated code) in <a href="https://github.com/oracle/linux-uek/commit/87a39a3d2ca9a5c7e4d35e4cf4b839c53cc0678d">UEK6 commit-1</a> and in <a href="https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/kernel/auditsc.c?id=069545997510833281f45f83e097017b9fef19b7">Upstream commit-1</a>.</p>
956
957 <h3 id="annotate-branch-direction-for-audit_in_mask">Annotate branch direction for <code style="background:#eeeeee;border:1px solic #cccccc;">audit_in_mask()</code></h3>
958
959 <pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">@@ -790,12 +790,13 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
960 rcu_read_lock();
961 list_for_each_entry_rcu(e, list, list) {
962 - if (audit_in_mask(&e->rule, major) &&
963 - audit_filter_rules(tsk, &e->rule, ctx, NULL,
964 - &state, false)) {
965 ...
966 + if (unlikely(audit_in_mask(&e->rule, major))) {
967 + if (audit_filter_rules(tsk, &e->rule, ctx, NULL,
968 + &state, false)) {</pre>
969
970 <p>Annotate <code style="background:#eeeeee;border:1px solic #cccccc;">audit_in_mask()</code> as <code style="background:#eeeeee;border:1px solic #cccccc;">unlikely()</code> to allow the compiler to pessimize the call to <code style="background:#eeeeee;border:1px solic #cccccc;">audit_filter_rules()</code>. Two reasons for this change:</p>
971
972 <ul>
973 <li>The primary motivation was to get rid of the extra branch mispred. This change succeeds in that task but it is unclear why: there’s no significant change in the basic-block structure. The only change is from a branch inversion due to the unlikely clause.</li>
974 <li>The branch inversion means that the not-taken direction is chosen more often: 32/37 times (changing from 5/37 earlier.) The issue-latency for not-taken branches is 0.5-1 cycles versus 1-2 cycles for taken branches[10], so the common direction is now slightly cheaper. (A minimal sketch of the annotation follows this list.)</li>
975 </ul>
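
<p>For reference, unlikely() is essentially a thin wrapper around GCC’s __builtin_expect() (the kernel’s compiler headers add branch-profiling variants on top). A minimal sketch of the annotated loop shape, again with hypothetical names rather than the real kernel code:</p>

<pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">/* What the annotation essentially expands to (simplified): */
#define unlikely(x) __builtin_expect(!!(x), 0)

struct rule    { unsigned int mask[64]; };
struct entry   { struct entry *next; struct rule rule; };
struct context { int major; };

int in_mask(const struct rule *r, unsigned long major);       /* as before */
int filter_rules(const struct rule *r, struct context *ctx);  /* opaque */

int filter(struct entry *e, struct context *ctx, unsigned long major)
{
        for (; e; e = e->next) {
                if (unlikely(in_mask(&e->rule, major))) {
                        /* Cold path: the call, and the spills/reloads around
                         * it, can be laid out out-of-line here, keeping the
                         * common no-match path a short not-taken branch. */
                        if (filter_rules(&e->rule, ctx))
                                return 1;
                }
        }
        return 0;
}</pre>

<p>The hint only biases the compiler’s block layout and its notion of the likely path; the hardware branch predictor still learns the actual behavior at run time.</p>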
976
977 <p>L1d-loads: reduce by 2 for each loop iteration. This is because the spills and reloads introduced in the “Cache <code style="background:#eeeeee;border:1px solic #cccccc;">ctx->major</code>…” patch have now been shifted to the unlikely path (the prologue and epilogue of the <code style="background:#eeeeee;border:1px solic #cccccc;">audit_filter_rules()</code> call.)</p>
978
979 <p>cycles: performance improves on average by ~30 cycles/call.</p>
980
981 <p>Change in latency for UEK6:</p>
982
983 <div class="divTable">
984 <div class="divTableHeading">
985 <div class="divTableCell">Version</div>
986
987 <div class="divTableCell">Min<br>
988 (ns)</div>
989
990 <div class="divTableCell">Mean<br>
991 (ns)</div>
992
993 <div class="divTableCell">Median<br>
994 (ns)</div>
995
996 <div class="divTableCell">Max<br>
997 (ns)</div>
998 </div>
999
1000 <div class="divTableRow">
1001 <div class="divTableCell">ctx->major</div>
1002
1003 <div class="divTableCell">183.50</div>
1004
1005 <div class="divTableCell">201.41</div>
1006
1007 <div class="divTableCell">198.80</div>
1008
1009 <div class="divTableCell">226.93</div>
1010 </div>
1011
1012 <div class="divTableRow">
1013 <div class="divTableCell">ctx->major+annot</div>
1014
1015 <div class="divTableCell">165.26</div>
1016
1017 <div class="divTableCell">188.72</div>
1018
1019 <div class="divTableCell">184.25</div>
1020
1021 <div class="divTableCell">230.34</div>
1022 </div>
1023 </div>
1024
1025 <p> </p>
1026
1027 <p>More details (<code style="background:#eeeeee;border:1px solic #cccccc;">perf-stat</code> and the before/after versions of the generated code) in <a href="https://github.com/oracle/linux-uek/commit/0288dbdbfb5768ad8ae8a445c72f523bcb99eca0">UEK6 commit-2</a>.</p>
1028
1029 <h3 id="remove-static-linkage-from-audit_filter_syscall">Remove static linkage from <code style="background:#eeeeee;border:1px solic #cccccc;">audit_filter_syscall()</code></h3>
1030
1031 <pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">@@ -777,7 +777,7 @@ static bool audit_in_mask(const struct audit_krule *rule, unsigned long
1032 * also not high enough that we already know we have to write an audit
1033 * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
1034 */
1035 -static enum audit_state audit_filter_syscall(struct task_struct *tsk,
1036 +enum audit_state audit_filter_syscall(struct task_struct *tsk,
1037 struct audit_context *ctx,
1038 struct list_head *list)</pre>
1039
1040 <p><code style="background:#eeeeee;border:1px solic #cccccc;">audit_filter_syscall()</code> is only used locally in the file and so is marked <code style="background:#eeeeee;border:1px solic #cccccc;">static</code>. Additionally, it’s only ever called with a fixed <code style="background:#eeeeee;border:1px solic #cccccc;">list</code> value of <code style="background:#eeeeee;border:1px solic #cccccc;">&audit_filter_list[AUDIT_FILTER_EXIT]</code>.</p>
1041
1042 <p>GCC’s constant propagation pass makes use of these two things to, quite reasonably, const-propagate the third argument to the point of use.</p>
1043
1044 <p>This causes the exit check in the <code style="background:#eeeeee;border:1px solic #cccccc;">list_for_each</code> loop to look like this:</p>
1045
1046 <pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">audit_filter_syscall.constprop.18(task, ctx):
1047 0: 48 8b 1b mov (%rbx),%rbx
1048 3: 48 81 fb e0 67 ac 82 cmp $0xffffffff82ac67e0,%rbx
1049 ffffffff8118b5ed: R_X86_64_32S audit_filter_list+0x40
1050 10: 75 e2 jne start_iter</pre>
1051
1052 <p>while, without const-propagation, it would have looked like this:</p>
1053
1054 <pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">audit_filter_syscall(task, ctx, list):
1055 0: 48 8b 1b mov (%rbx),%rbx
1056 3: 4c 39 e3 cmp %r12,%rbx
1057 6: 75 e6 jne start_iter</pre>
1058
1059 <p>Now, either one ought to be alright: both the <code style="background:#eeeeee;border:1px solic #cccccc;">cmp imm32,r</code> and <code style="background:#eeeeee;border:1px solic #cccccc;">cmp r,r</code> forms have a latency of 1 cycle, and each is a single micro-op.</p>
1060
1061 <p>The second form of the <code style="background:#eeeeee;border:1px solic #cccccc;">cmp</code>, however, can be macro-op fused with the <code style="background:#eeeeee;border:1px solic #cccccc;">jne</code>; I’m not entirely sure whether the first form can be[11]. The second form is also denser, though that’s not a concern here.</p>
1062
1063 <p>Disallowing GCC from making assumptions about calling contexts, by removing the <code style="background:#eeeeee;border:1px solic #cccccc;">static</code> linkage from <code style="background:#eeeeee;border:1px solic #cccccc;">audit_filter_syscall()</code>, forces it to pass the <code style="background:#eeeeee;border:1px solic #cccccc;">list</code> parameter in a register, which results in a small performance improvement: ~20 cycles (about 0.5 cycles/loop iteration.)</p>
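
<p>As a toy illustration of the linkage effect (hypothetical names, not the kernel code; the exact outcome depends on the GCC version and its cloning heuristics), the following can be built both ways and the generated loops compared:</p>

<pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">/* constprop.c: toy showing how 'static' linkage enables GCC's
 * interprocedural constant propagation.  Compare the generated code:
 *     gcc -O2 -S constprop.c               (static version)
 *     gcc -O2 -S -DNO_STATIC constprop.c   (extern version)
 * With the static build GCC may emit a walk.constprop.N clone whose
 * loop-exit compare uses the sentinel's address as an immediate; the
 * extern build has to keep 'head' in a register. */
struct node { struct node *next; };

struct node sentinel = { &sentinel };   /* stand-in for &audit_filter_list[AUDIT_FILTER_EXIT] */

#ifdef NO_STATIC
#define MAYBE_STATIC
#else
#define MAYBE_STATIC static
#endif

/* noinline mirrors the real function, which is too large to inline. */
MAYBE_STATIC __attribute__((noinline)) int walk(struct node *head)
{
        int n = 0;
        for (struct node *p = head->next; p != head; p = p->next)
                n++;
        return n;
}

int count_all(void)
{
        return walk(&sentinel);         /* the only call site, constant argument */
}</pre>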
1064
1065 <p>Change in latency for UEK6:</p>
1066
1067 <div class="divTable">
1068 <div class="divTableHeading">
1069 <div class="divTableCell">Version</div>
1070
1071 <div class="divTableCell">Min<br>
1072 (ns)</div>
1073
1074 <div class="divTableCell">Mean<br>
1075 (ns)</div>
1076
1077 <div class="divTableCell">Median<br>
1078 (ns)</div>
1079
1080 <div class="divTableCell">Max<br>
1081 (ns)</div>
1082 </div>
1083
1084 <div class="divTableRow">
1085 <div class="divTableCell">ctx->major+annot</div>
1086
1087 <div class="divTableCell">165.26</div>
1088
1089 <div class="divTableCell">188.72</div>
1090
1091 <div class="divTableCell">184.25</div>
1092
1093 <div class="divTableCell">230.34</div>
1094 </div>
1095
1096 <div class="divTableRow">
1097 <div class="divTableCell">ctx->major+annot+extern</div>
1098
1099 <div class="divTableCell">159.88</div>
1100
1101 <div class="divTableCell">184.35</div>
1102
1103 <div class="divTableCell">177.62</div>
1104
1105 <div class="divTableCell">250.82</div>
1106 </div>
1107 </div>
1108
1109 <p> </p>
1110
1111 <p>More details (<code style="background:#eeeeee;border:1px solic #cccccc;">perf-stat</code> and the before/after versions of the generated code) in <a href="https://github.com/oracle/linux-uek/commit/5a74015e20bff63d1052359fbc2c3418e0f6bc4e">UEK6 commit-3</a> and in <a href="https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/kernel/auditsc.c?id=50979953c0c41e929e5f955800da68e1bb24c7ab">Upstream commit-3</a>.</p>
1112
1113 <h2 id="summary">Summary</h2>
1114
1115 <p>The audit subsystem is fairly stable in the Linux kernel, not given to frequent changes. So it was puzzling when it became slower in recent kernels, and, because a primary user is the syscall path, concerning[12].</p>
1116
1117 <p>The cause turned out to be a higher skew in allocated buffers, which results in a more lopsided cache-set distribution.</p>
1118
1119 <p>The fixes compensate for the higher costs in the loop by taking advantage of the peculiarities of the execution path and optimizing for the speculative nature of the CPU pipeline.</p>
1120
1121 <p>The three patches, in sum, reduce the overhead by about 30ns (~100 cycles).</p>
1122
1123 <p>Final <code style="background:#eeeeee;border:1px solic #cccccc;">perf stat -d -r 5</code> numbers go from:</p>
1124
1125 <pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;"># perf stat -d -r 5 ./getpid
1126 # output normalized for a single getpid() call
1127
1128 cycles 761.65 ( +- 5.22% )
1129 instructions 1639.17 ( +- 0.00% )
1130 IPC 2.18 ( +- 5.50% )
1131 branches 328.21 ( +- 0.00% )
1132 branch-misses 1.37 ( +- 6.56% )
1133 L1-dcache-loads 404.35 ( +- 0.00% )
1134 L1-dcache-load-misses 7.99 ( +- 70.71% )</pre>
1135
1136 <p>to:</p>
1137
1138 <pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;"># perf stat -d -r 5 ./getpid
1139 # output normalized for a single getpid() call
1140
1141 cycles 669.09 ( +- 11.23% )
1142 instructions 1342.04 ( +- 0.00% )
1143 IPC 2.03 ( +- 9.85% )
1144 branches 328.19 ( +- 0.00% )
1145 branch-misses 0.56 ( +- 5.35% )
1146 L1-dcache-loads 384.31 ( +- 0.00% )
1147 L1-dcache-load-misses 5.77 ( +- 84.57% )</pre>
1148
1149 <p>This compares quite well to the UEK5-baseline:</p>
1150
1151 <pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;"># perf stat -d -r 5 ./getpid
1152 # output normalized for a single getpid() call
1153
1154 cycles 672.90 ( +- 1.65% )
1155 instructions 1622.08 ( +- 0.00% )
1156 IPC 2.41 ( +- 1.65% )
1157 branches 321.20 ( +- 0.00% )
1158 branch-misses 0.51 ( +- 0.00% )
1159 L1-dcache-loads 401.32 ( +- 0.00% )
1160 L1-dcache-load-misses 2.28 ( +- 59.62% )</pre>
1161
1162 <p>Note for non-Skylake-X architectures: Intel Icelake and AMD Milan (the other architectures tested) cope with L1d-load-misses much better, so their baseline performance is correspondingly better.</p>
1163
1164 <p>With these patches, they only show a small improvement (~10ns): Icelake has a bigger L1d-cache (48K) and a much bigger ROB. Milan also has a bigger ROB and does memory renaming and a bunch of other pipeline optimizations that limit the benefit of these patches.</p>
1165
1166 <p><strong>Endnote:</strong> what I found personally instructive was how much C really is “a portable assembler”, and how significant the codegen (and performance) changes resulting from minimal source changes can be.</p>
1167
1168 <h2 id="references">References</h2>
1169
1170 <ol type="1">
1171 <li>
1172 <p><code style="background:#eeeeee;border:1px solic #cccccc;">getpid()</code> has a minimal kernel execution path (only does a PID lookup), and so is generally used to measure the overhead of the syscall path.</p>
1173 </li>
1174 <li>
1175 <p>Comparing the IPC for the audit-only portion shows a starker drop:</p>
1176
1177 <pre class="brush: bash;" style="background:#eeeeee;border:1px solid #cccccc;padding:5px 10px;">UEK5: 1427.0 instructions # 3.41 insn per cycle
1178 UEK6: 1432.0 instructions # 2.84 insn per cycle</pre>
1179 </li>
1180 <li>
1181 <p>Alas no, alias analysis is an undecidable problem.</p>
1182 </li>
1183 <li>
1184 <p>Or for that matter, what causes the extra branch-miss.</p>
1185 </li>
1186 <li>
1187 <p>Another possibility is out-of-line code -- frequent interrupts, vmexits, etc. -- trashing the cache, but from profiling these were a non-issue.</p>
1188 </li>
1189 <li>
1190 <p>Measured using pearson(x, y), the Pearson correlation coefficient between quantities x and y.</p>
1191 </li>
1192 <li>
1193 <p>You might notice that this analysis does not address the extra branch-miss. That's because I still have no clue what causes it.</p>
1194 </li>
1195 <li>
1196 <p>The correct fix would be to fix whatever ails the allocator. However, from a quick look at the changes that have gone into related code, it seems non-trivial to find a particular commit which points to the root cause of the skew (especially given that the skew is not constant, but varies from run-to-run.) Also, notably, the fixes described below also apply to UEK5, which means that even if UEK6 becomes faster, UEK5 will also improve somewhat.</p>
1197 </li>
1198 <li>
1199 <p>As mentioned in <a href="#cpu-parameters">CPU-parameters</a>, L1d-loads take 4-6 cycles on Skylake-X. We also know that in the good case (UEK5), this loop is capable of an IPC of 3.41 insn per cycle. So, hiding L1d-load latency is critical for good performance.</p>
1200 </li>
1201 <li>
1202 <p><a href="https://www.agner.org/optimize/instruction_tables.pdf">https://www.agner.org/optimize/instruction_tables.pdf</a>, pg 298 (Skylake-X)</p>
1203 </li>
1204 <li>
1205 <p>The first form, fused, needs three inputs: <code style="background:#eeeeee;border:1px solic #cccccc;">%rbx</code>, an <code style="background:#eeeeee;border:1px solic #cccccc;">imm32</code> encoding the address being compared against, and an <code style="background:#eeeeee;border:1px solic #cccccc;">imm8</code> encoding the distance to the branch-dest; the second needs two registers, <code style="background:#eeeeee;border:1px solic #cccccc;">%rbx</code> and <code style="background:#eeeeee;border:1px solic #cccccc;">%r12</code>, and only the <code style="background:#eeeeee;border:1px solic #cccccc;">imm8</code>.</p>
1206 </li>
1207 <li>
1208 <p>Just for context, a kernel build (x86-defconfig) makes an aggregate of 27M syscalls, with a syscall every 44us.</p>
1209 </li>
1210 </ol>
1211
1212
1213 </section>
1214 <!-- /RC84v1 -->
1215
1216 <!-- RC84v2 -->
1217 <section class="rc84v2 cpad">
1218 <div class="rc84w1 cwidth">
1219
1220 <div class="rc84bio">
1221 <div class="rc84img">
1222 <img src="../_cache_8b25/_themesdelivery/Blogs-New-Theme/assets/img/ui_defaultuserimage.jpg" alt="">
1223 </div>
1224 <div class="rc84blurb">
1225 <div class="blogtile-w2-inner text-wrap">
1226 <h4>Ankur Arora</h4>
1227
1228 <p></p>
1229 </div>
1230 </div>
1231 </div>
1232
1233 </div>
1234 </section>
1235 <!-- /RC84v2 -->
1236
1237
1238 </div>
1239
1240 </div>
1241 </section>
1242 <!-- /RC84v0 -->
1243
1244 <!-- /RC83v0 -->
1245 <input type="hidden" name="hiddenField" value="September 12, 2023" id="pubdate">
1246 <input type="hidden" name="hiddenField" value="linux" id="primarychannel">
1248 <!-- -->
1249 <script>
1250 window.SCSMacros = window.SCSMacros || {};
1251 var pubdate = document.getElementById("pubdate").value;
1252 window.SCSMacros.getCreationdateMacro = pubdate;
1253
1254
1255 </script>
1256 <script>
1257 if (SCS && SCS['siteId'] === "Blogs-Home") {
1258 var Primarychannel = document.getElementById("primarychannel").value;
1259 if(Primarychannel != 'undefined' && Primarychannel != 'Blogs-Home'){
1260 var slug = window.location.pathname.split('/')[2];
1261 var domain = window.location.origin;
1262 if(domain === "https://blogs.oracle.com"){
1263 window.location.replace("https://blogs.oracle.com/"+Primarychannel+"/post/"+slug);
1264 }else{
1265 window.location.replace("https://blogs-stage.oracle.com/"+Primarychannel+"/post/"+slug);
1266 }
1267 }
1268 }
1269 </script>
1270
1271 <script type="text/javascript">
1272 /*! ORACLE - TRACKING URL */
1273 $(document).ready(function() {
1274 const oracleLinks = document.querySelectorAll('a[href*="go.oracle.com"][data-trackas]');
1275
1276 oracleLinks.forEach(link => {
1277 const url = new URL(link.href);
1278
1279 if (!url.searchParams.has('source') && !url.searchParams.has('src1')) {
1280 const currentUrlParams = new URLSearchParams(window.location.search);
1281 const existingUrlParams = new URLSearchParams(url.search);
1282
1283 if (currentUrlParams.has('source')) {
1284 currentUrlParams.set('src1', currentUrlParams.get('source'));
1285 currentUrlParams.delete('source');
1286 }
1287
1288 let mergedParams = existingUrlParams.toString();
1289 if (mergedParams) {
1290 mergedParams += '&';
1291 } else {
1292 mergedParams += '?';
1293 }
1294 mergedParams += currentUrlParams.toString();
1295
1296 url.search = mergedParams;
1297
1298 link.href = decodeURIComponent(url.toString());
1299
1300 }
1301 });
1302 });
1303 </script>
1304
1305 </div>
1306 </div>
1307 </div>
1308 </div>
1309 </div>
1310 </div></div></div></div></div>
1311 <!-- <div class="scs-slot" id="category-id"></div> -->
1312 <!-- RH03v5 -->
1313 <div id="Next-Previous-Posts" class="scs-slot"><div class="scs-row"><div class="scs-col" style="width: 100%;"><div id="a057a3dc-2397-4b35-88dc-e9904a3f1789"><div class="scs-component-bounding-box"><!-- -->
1314 <div>
1315 <div class="scs-custom-component scs-component scs-component-default-style" style="margin-top:5px;margin-right:5px;margin-bottom:5px;margin-left:5px;">
1316 <div class="scs-component-content" style="width:100%;">
1317 <div style="" class="scs-custom-component-wrapper">
1318 <div id="a057a3dc-2397-4b35-88dc-e9904a3f1789customComponentDiv" data-scs-hydrate="true" data-scs-contenttype="Blog-Post" data-asset-operation="view:CORE8B88E20204C04A0DADCEBC0499683C49">
1319 <section class="rc83 rc83v0 rw-neutral-00bg cpad xwidth">
1320 <div class="rc83w1 cwidth">
1321 <div class="rc83pagenav">
1322 <div class="rc83nav-lt">
1323 <a href="oracle-linux-automation-manager-21" class="rc83arrow-lt">
1324 <div class="icn-img icn-chevron-left"><br></div>
1325 <p id="PreviousPostText">Previous Post</p>
1326 </a>
1327 <h4>Discover the Latest Advancements in Automation with Oracle Linux Automation Manager 2.1</h4>
1328 <div class="rc83sub">
1329 <span><a href="/authors/monica-s">Monica S</a> | </span><span>3</span><span> min read</span>
1330 </div>
1331 </div>
1332 <div class="rc83nav-rt">
1333 <a href="get-inspired-at-oracle-cloudworld-2023hear-from-customers-technical-industry-experts-and-executives-and-get-your-questions-answered" class="rc83arrow-rt">
1334 <p id="NextPostText">Next Post</p>
1335 <div class="icn-img icn-chevron-right"><br></div>
1336 </a>
1337 <h4>Get inspired at Oracle CloudWorld 2023—hear from customers, technical industry experts, and executives and get your questions answered</h4>
1338 <div class="rc83sub">
1339 <span><a href="/authors/michele-resta">Michele Resta</a> | </span><span>3</span><span> min read</span>
1340 </div>
1341 </div>
1342 </div>
1343 </div>
1344 </section>
1345
1346 </div>
1347 </div>
1348 </div>
1349 </div>
1350 </div>
1351 </div></div></div></div></div>
1352 <!-- <div class="scs-slot" id="recent-posts"></div> -->
1353 <!-- U10v6 -->
1354 <div class="u10 u10v6" data-trackas="ffooter" data-ocomid="redwood">
1355
1356 <div class="u10w1">
1357
1358 <div class="u10w2">
1359 <div class="u10w3">
1360 <h5>Resources for</h5>
1361 <ul>
1362 <li><a href="https://www.oracle.com/corporate/">About</a></li>
1363 <li><a href="https://www.oracle.com/corporate/careers/"
1364 data-lbl="about-oracle:careers">Careers</a></li>
1365 <li><a href="https://developer.oracle.com">Developers</a></li>
1366 <li><a href="https://investor.oracle.com/home/default.aspx">Investors</a></li>
1367 <li><a href="https://www.oracle.com/partner/">Partners</a></li>
1368 <li><a href="https://www.oracle.com/startup/">Startups</a></li>
1369 </ul>
1370 </div>
1371 </div>
1372 <div class="u10w2">
1373 <div class="u10w3">
1374 <h5>Why Oracle</h5>
1375 <ul>
1376 <li><a href="https://www.oracle.com/corporate/analyst-reports.html">Analyst Reports</a></li>
1377 <li><a href="https://www.oracle.com/cx/what-is-crm/ ">Best CRM</a></li>
1378 <li><a href="https://www.oracle.com/cloud/economics/">Cloud Economics</a></li>
1379 <li><a href="https://www.oracle.com/corporate/citizenship/">Corporate Responsibility</a>
1380 </li>
1381 <li><a href="https://www.oracle.com/corporate/careers/diversity-inclusion/">Diversity and
1382 Inclusion</a></li>
1383 <li><a href="https://www.oracle.com/corporate/security-practices/">Security Practices</a>
1384 </li>
1385 </ul>
1386 </div>
1387 </div>
1388 <div class="u10w2">
1389 <div class="u10w3">
1390 <h5>Learn</h5>
1391 <ul>
1392 <li><a href="https://www.oracle.com/cx/service/what-is-customer-service/ ">What is Customer
1393 Service?</a></li>
1394 <li><a href=" https://www.oracle.com/erp/what-is-erp/">What is ERP?</a></li>
1395 <li><a
1396 href=" https://www.oracle.com/cx/marketing/automation/what-is-marketing-automation/ ">What
1397 is Marketing Automation?</a></li>
1398 <li><a href="https://www.oracle.com/erp/what-is-procurement/ ">What is Procurement?</a></li>
1399 <li><a
1400 href="https://www.oracle.com/human-capital-management/talent-management/what-is-talent-management/ ">What
1401 is Talent Management?</a></li>
1402 <li><a
1403 href=" https://www.oracle.com/cloud/compute/virtual-machines/what-is-virtual-machine/ ">What
1404 is VM?</a></li>
1405 </ul>
1406 </div>
1407 </div>
1408 <div class="u10w2">
1409 <div class="u10w3">
1410 <h5>What's New</h5>
1411 <ul>
1412 <li><a
1413 href="https://www.oracle.com/cloud/free/?source=:ow:o:h:nav:050120SiteFooter&intcmp=:ow:o:h:nav:050120SiteFooter">Try
1414 Oracle Cloud Free Tier</a></li>
1415 <li><a href="https://www.oracle.com/solutions/green/">Oracle Sustainability</a></li>
1416 <li><a href="https://www.oracle.com/corporate/covid-19.html ">Oracle COVID-19 Response</a>
1417 </li>
1418 <li><a href="https://www.oracle.com/sailgp/">Oracle and SailGP</a></li>
1419 <li><a href="https://www.oracle.com/premier-league/">Oracle and Premier League</a></li>
1420 <li><a href="https://www.oracle.com/redbullracing/">Oracle and Red Bull Racing Honda</a>
1421 </li>
1422 </ul>
1423 </div>
1424 </div>
1425 <div class="u10w2">
1426 <div class="u10w3">
1427 <h5>Contact Us</h5>
1428 <ul>
1429 <li><a href="tel:18006330738">US Sales 1.800.633.0738</a></li>
1430 <li><a href="https://www.oracle.com/corporate/contact/">How can we help?</a></li>
1431 <li><a href="https://go.oracle.com/subscriptions">Subscribe to Oracle Content</a></li>
1432 <li><a
1433 href="https://www.oracle.com/cloud/free/?source=:ow:o:h:nav:050120SiteFooter&intcmp=:ow:o:h:nav:050120SiteFooter">Try
1434 Oracle Cloud Free Tier</a></li>
1435 <li><a href="https://www.oracle.com/events/">Events</a></li>
1436 <li><a href="https://www.oracle.com/news/" data-lbl="news-events:newsroom">News</a></li>
1437 </ul>
1438 </div>
1439 </div>
1440 <div class="u10w4">
1441 <hr />
1442 </div>
1443
1444 <div class="u10w5 ">
1445 <ul class="u10-links u10w10">
1446 <li><a href="https://www.oracle.com/legal/copyright.html" data-lbl="copyright">© 2022 Oracle</a></li>
1447
1448 <li><a data-lbl="privacy"
1449 href="https://www.oracle.com/legal/privacy/">Privacy</a><span>/</span><a
1450 data-lbl="do-not-sell-my-info"
1451 href="https://www.oracle.com/legal/privacy/privacy-choices.html">Do Not Sell My Info</a>
1452 </li>
1453 <li>
1454 <div id='teconsent'> </div>
1455 </li>
1456 <li><a href="https://www.oracle.com/legal/privacy/privacy-policy.html#advertising"
1457 data-lbl="ad-choices">Ad Choices</a></li>
1458 <li><a href="https://www.oracle.com/corporate/careers/" data-lbl="careers">Careers</a></li>
1459 </ul>
1460
1461 </div>
1462 </div>
1463
1464 </div>
1465 <!-- /U10v6 -->
1466
1467
1468 </div>
1469 <script type="text/javascript" src="https://www.oracle.com/us/assets/metrics/ora_compendiumblogs.js"></script>
1470 <script type="text/javascript" src="https://www.oracle.com/assets/truste-oraclelib.js"></script>
1471 <script async="async" type="text/javascript"
1472 src="//consent.trustarc.com/notice?domain=oracle.com&c=teconsent&js=bb¬iceType=bb&text=true>m=1&cdn=1&pcookie"
1473 crossorigin=""></script>
1474 <script type="text/javascript" src="../_cache_8b25/siteinfo-common.js" charset="utf-8"></script><script type="text/javascript" src="../siteinfo-dynamic.js"></script>
1475 <script src="../_cache_8b25/_sitesclouddelivery/renderer/renderer.js"></script>
1476
1477
1478
1479
1480 <script src="../_cache_8b25/_themesdelivery/Blogs-New-Theme/assets/js/blogs-script.js"></script>
1481 <script src="../_cache_8b25/_themesdelivery/Blogs-New-Theme/assets/js/oracle-script.js"></script>
1482
1483 <!-- Avoid FOUC issue in FF with async loading of style sheets -->
1484 <style>
1485 body {
1486 opacity: 1;
1487 }
1488 </style>
1489
1490 <script type="text/javascript">
1491 $(document).ready(function () {
1492 $('a[data-lbl="copyright"]').html("© " + new Date().getFullYear() +" Oracle " );
1493
1494 });
1495 </script>
1496 <!--DTM/Launch embed code - Footer -->
1497
1498 </body>
1499
1500