x11_x64.html

<!DOCTYPE html>
<html>
<head>
<title>Learn x86-64 assembly by writing a GUI from scratch</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link type="application/atom+xml" href="/blog/feed.xml" rel="self">
<link rel="shortcut icon" type="image/ico" href="/blog/favicon.ico">
<link rel="stylesheet" type="text/css" href="main.css">
<link rel="stylesheet" href="https://unpkg.com/@highlightjs/cdn-assets@11.8.0/styles/default.min.css">
<script src="highlight.min.js"></script>
<!-- From https://github.com/odin-lang/odin-lang.org/blob/6f48c2cfb094a42dffd34143884fa958bd9c0ba2/themes/odin/layouts/partials/head.html#L71 -->
<script src="x86asm.min.js"></script>
<script src="odin_syntax.js"></script>
<script type="module" src="search_index.js"></script>
<script type="module" src="search.js"></script>
</head>
<body>

<div id="banner">
    <div id="name">
        <img id="me" src="me.jpeg">
        <span>Philippe Gaultier</span>
    </div>
    <input id="search" placeholder="🔎 Search" autocomplete=off>
    <ul>
      <li> <a href="/blog/body_of_work.html">Body of work</a> </li>
      <li> <a href="/blog/articles-by-tag.html">Tags</a> </li>
      <li> <a href="https://github.com/gaultier/resume/raw/master/Philippe_Gaultier_resume_en.pdf">
          Resume
        </a> </li>

      <li> <a href="/blog/feed.xml">
        <svg viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg">
        <path fill-rule="evenodd" clip-rule="evenodd" d="M5.5 3.5C4.39543 3.5 3.5 4.39543 3.5 5.5V18.5C3.5 19.6046 4.39543 20.5 5.5 20.5H18.5C19.6046 20.5 20.5 19.6046 20.5 18.5V5.5C20.5 4.39543 19.6046 3.5 18.5 3.5H5.5ZM7 19C8.10457 19 9 18.1046 9 17C9 15.8954 8.10457 15 7 15C5.89543 15 5 15.8954 5 17C5 18.1046 5.89543 19 7 19ZM6.14863 10.5052C6.14863 10.0379 6.52746 9.65906 6.99478 9.65906C7.95949 9.65906 8.91476 9.84908 9.80603 10.2183C10.6973 10.5874 11.5071 11.1285 12.1893 11.8107C12.8715 12.4929 13.4126 13.3027 13.7817 14.194C14.1509 15.0852 14.3409 16.0405 14.3409 17.0052C14.3409 17.4725 13.9621 17.8514 13.4948 17.8514C13.0275 17.8514 12.6486 17.4725 12.6486 17.0052C12.6486 16.2627 12.5024 15.5275 12.2183 14.8416C11.9341 14.1556 11.5177 13.5324 10.9927 13.0073C10.4676 12.4823 9.84437 12.0659 9.15842 11.7817C8.47246 11.4976 7.73726 11.3514 6.99478 11.3514C6.52746 11.3514 6.14863 10.9725 6.14863 10.5052ZM7 5.15385C6.53268 5.15385 6.15385 5.53268 6.15385 6C6.15385 6.46732 6.53268 6.84615 7 6.84615C8.33342 6.84615 9.65379 7.10879 10.8857 7.61907C12.1176 8.12935 13.237 8.87728 14.1799 9.82015C15.1227 10.763 15.8707 11.8824 16.3809 13.1143C16.8912 14.3462 17.1538 15.6666 17.1538 17C17.1538 17.4673 17.5327 17.8462 18 17.8462C18.4673 17.8462 18.8462 17.4673 18.8462 17C18.8462 15.4443 18.5397 13.9039 17.9444 12.4667C17.3491 11.0294 16.4765 9.72352 15.3765 8.6235C14.2765 7.52349 12.9706 6.65091 11.5333 6.05558C10.0961 5.46026 8.55566 5.15385 7 5.15385Z" fill="#000000"/>
        </svg>
        </a> </li>

      <li> <a href="https://www.linkedin.com/in/philippegaultier/">
        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" data-supported-dps="24x24" fill="currentColor" class="mercado-match" width="24" height="24" focusable="false">
              <path d="M20.5 2h-17A1.5 1.5 0 002 3.5v17A1.5 1.5 0 003.5 22h17a1.5 1.5 0 001.5-1.5v-17A1.5 1.5 0 0020.5 2zM8 19H5v-9h3zM6.5 8.25A1.75 1.75 0 118.3 6.5a1.78 1.78 0 01-1.8 1.75zM19 19h-3v-4.74c0-1.42-.6-1.93-1.38-1.93A1.74 1.74 0 0013 14.19a.66.66 0 000 .14V19h-3v-9h2.9v1.3a3.11 3.11 0 012.7-1.4c1.55 0 3.36.86 3.36 3.66z"/>
            </svg>
        </a> </li>
      <li> <a href="https://github.com/gaultier">
        <svg height="32" aria-hidden="true" viewBox="0 0 24 24" version="1.1" width="32" data-view-component="true" class="octicon octicon-mark-github v-align-middle">
          <path d="M12.5.75C6.146.75 1 5.896 1 12.25c0 5.089 3.292 9.387 7.863 10.91.575.101.79-.244.79-.546 0-.273-.014-1.178-.014-2.142-2.889.532-3.636-.704-3.866-1.35-.13-.331-.69-1.352-1.18-1.625-.402-.216-.977-.748-.014-.762.906-.014 1.553.834 1.769 1.179 1.035 1.74 2.688 1.25 3.349.948.1-.747.402-1.25.733-1.538-2.559-.287-5.232-1.279-5.232-5.678 0-1.25.445-2.285 1.178-3.09-.115-.288-.517-1.467.115-3.048 0 0 .963-.302 3.163 1.179.92-.259 1.897-.388 2.875-.388.977 0 1.955.13 2.875.388 2.2-1.495 3.162-1.179 3.162-1.179.633 1.581.23 2.76.115 3.048.733.805 1.179 1.825 1.179 3.09 0 4.413-2.688 5.39-5.247 5.678.417.36.776 1.05.776 2.128 0 1.538-.014 2.774-.014 3.162 0 .302.216.662.79.547C20.709 21.637 24 17.324 24 12.25 24 5.896 18.854.75 12.5.75Z"/>
        </svg>
        </a> </li>
      <li> <a href="https://hachyderm.io/@pg">
        <svg width="75" height="79" viewBox="0 0 75 79" fill="none" xmlns="http://www.w3.org/2000/svg">
        <path d="M73.8393 17.4898C72.6973 9.00165 65.2994 2.31235 56.5296 1.01614C55.05 0.797115 49.4441 0 36.4582 0H36.3612C23.3717 0 20.585 0.797115 19.1054 1.01614C10.5798 2.27644 2.79399 8.28712 0.904997 16.8758C-0.00358524 21.1056 -0.100549 25.7949 0.0682394 30.0965C0.308852 36.2651 0.355538 42.423 0.91577 48.5665C1.30307 52.6474 1.97872 56.6957 2.93763 60.6812C4.73325 68.042 12.0019 74.1676 19.1233 76.6666C26.7478 79.2728 34.9474 79.7055 42.8039 77.9162C43.6682 77.7151 44.5217 77.4817 45.3645 77.216C47.275 76.6092 49.5123 75.9305 51.1571 74.7385C51.1797 74.7217 51.1982 74.7001 51.2112 74.6753C51.2243 74.6504 51.2316 74.6229 51.2325 74.5948V68.6416C51.2321 68.6154 51.2259 68.5896 51.2142 68.5661C51.2025 68.5426 51.1858 68.522 51.1651 68.5058C51.1444 68.4896 51.1204 68.4783 51.0948 68.4726C51.0692 68.4669 51.0426 68.467 51.0171 68.4729C45.9835 69.675 40.8254 70.2777 35.6502 70.2682C26.7439 70.2682 24.3486 66.042 23.6626 64.2826C23.1113 62.762 22.7612 61.1759 22.6212 59.5646C22.6197 59.5375 22.6247 59.5105 22.6357 59.4857C22.6466 59.4609 22.6633 59.4391 22.6843 59.422C22.7053 59.4048 22.73 59.3929 22.7565 59.3871C22.783 59.3813 22.8104 59.3818 22.8367 59.3886C27.7864 60.5826 32.8604 61.1853 37.9522 61.1839C39.1768 61.1839 40.3978 61.1839 41.6224 61.1516C46.7435 61.008 52.1411 60.7459 57.1796 59.7621C57.3053 59.7369 57.431 59.7154 57.5387 59.6831C65.4861 58.157 73.0493 53.3672 73.8178 41.2381C73.8465 40.7606 73.9184 36.2364 73.9184 35.7409C73.9219 34.0569 74.4606 23.7949 73.8393 17.4898Z" fill="url(#paint0_linear_549_34)"/>
        <path d="M61.2484 27.0263V48.114H52.8916V27.6475C52.8916 23.3388 51.096 21.1413 47.4437 21.1413C43.4287 21.1413 41.4177 23.7409 41.4177 28.8755V40.0782H33.1111V28.8755C33.1111 23.7409 31.0965 21.1413 27.0815 21.1413C23.4507 21.1413 21.6371 23.3388 21.6371 27.6475V48.114H13.2839V27.0263C13.2839 22.7176 14.384 19.2946 16.5843 16.7572C18.8539 14.2258 21.8311 12.926 25.5264 12.926C29.8036 12.926 33.0357 14.5705 35.1905 17.8559L37.2698 21.346L39.3527 17.8559C41.5074 14.5705 44.7395 12.926 49.0095 12.926C52.7013 12.926 55.6784 14.2258 57.9553 16.7572C60.1531 19.2922 61.2508 22.7152 61.2484 27.0263Z" fill="white"/>
        <defs>
        <linearGradient id="paint0_linear_549_34" x1="37.0692" y1="0" x2="37.0692" y2="79" gradientUnits="userSpaceOnUse">
        <stop stop-color="#6364FF"/>
        <stop offset="1" stop-color="#563ACC"/>
        </linearGradient>
        </defs>
        </svg>
        </a> </li>
      <li> <a href="https://bsky.app/profile/pgaultier.bsky.social">
        <svg fill="none" viewBox="0 0 64 57" width="32" style="width: 32px; height: 28.5px;"><path fill="#0085ff" d="M13.873 3.805C21.21 9.332 29.103 20.537 32 26.55v15.882c0-.338-.13.044-.41.867-1.512 4.456-7.418 21.847-20.923 7.944-7.111-7.32-3.819-14.64 9.125-16.85-7.405 1.264-15.73-.825-18.014-9.015C1.12 23.022 0 8.51 0 6.55 0-3.268 8.579-.182 13.873 3.805ZM50.127 3.805C42.79 9.332 34.897 20.537 32 26.55v15.882c0-.338.13.044.41.867 1.512 4.456 7.418 21.847 20.923 7.944 7.111-7.32 3.819-14.64-9.125-16.85 7.405 1.264 15.73-.825 18.014-9.015C62.88 23.022 64 8.51 64 6.55c0-9.818-8.578-6.732-13.873-2.745Z"/></svg>
        </a> </li>
    </ul>
</div>
<div id="search-matches" hidden>
</div>
<div id="pseudo-body">

<div class="article-prelude">
  <p><a href="/blog"> ⏴ Back to all articles</a></p>

  <p class="publication-date">Published on 2023-05-31</p>
</div>
<div class="article-title">
<h1>Learn x86-64 assembly by writing a GUI from scratch</h1>
  <div class="tags"> <a href="/blog/articles-by-tag.html#gui" class="tag">GUI</a> <a href="/blog/articles-by-tag.html#x86-64" class="tag">x86_64</a> <a href="/blog/articles-by-tag.html#x11" class="tag">X11</a> <a href="/blog/articles-by-tag.html#optimization" class="tag">Optimization</a></div>
  </div>
 <strong>Table of contents</strong>
<ul>

  <li>
    <a href="#3018859686-what-do-we-need">What do we need?</a>
  </li>

  <li>
    <a href="#2049729589-x11-basics">X11 basics</a>
  </li>

  <li>
    <a href="#1992549332-main-in-x64-assembly">Main in x64 assembly</a>
  </li>

  <li>
    <a href="#2732446636-a-stack-primer">A stack primer</a>
<ul>

  <li>
    <a href="#657479577-a-small-stack-example">A small stack example</a>
  </li>
</ul>
  </li>

  <li>
    <a href="#4163415294-opening-a-socket">Opening a socket</a>
  </li>

  <li>
    <a href="#2750592591-connecting-to-the-server">Connecting to the server</a>
  </li>

  <li>
    <a href="#484246098-sending-data-over-the-socket">Sending data over the socket</a>
  </li>

  <li>
    <a href="#1309822244-generating-ids">Generating ids</a>
  </li>

  <li>
    <a href="#4134081642-opening-a-font">Opening a font</a>
  </li>

  <li>
    <a href="#3515439192-creating-a-graphical-context">Creating a graphical context</a>
  </li>

  <li>
    <a href="#2863200396-creating-the-window">Creating the window</a>
  </li>

  <li>
    <a href="#577694983-mapping-the-window">Mapping the window</a>
  </li>

  <li>
    <a href="#677275119-polling-for-server-messages">Polling for server messages</a>
  </li>

  <li>
    <a href="#3433791877-drawing-text">Drawing text</a>
  </li>

  <li>
    <a href="#1770781618-the-end">The end</a>
  </li>

  <li>
    <a href="#1512890027-addendum-the-full-code">Addendum: the full code</a>
  </li>
</ul>

<p><em>Discussions: <a href="https://news.ycombinator.com/item?id=36153237">Hacker News</a>, <a href="https://old.reddit.com/r/programming/comments/13xgbk6/learn_x8664_assembly_by_writing_a_gui_from_scratch/">r/programming</a>, <a href="https://lobste.rs/s/dvtzfl/learn_x86_64_assembly_by_writing_gui_from">Lobsters</a>.</em></p>
<p>Most people think assembly is only to be used to write toy programs for learning purposes, or to write a highly optimized version of a specific function inside a codebase written in a high-level language.</p>
<p>Well, what if we wrote a whole program in assembly that opens a GUI window? It will be the hello world of the GUI world, but that still counts. Here is what we are working towards:</p>
<p><img src="x11_x64_final.png" alt="Result" /></p>
<p>I wanted to expand my knowledge of assembly by doing something fun and motivating. It all originated from the observation that so many program binaries today are very big, often over 30 MiB (!), and I asked myself: How small can a binary be for a (very simplistic) GUI? Well, it turns out, very small. Spoiler alert: around 1 KiB!</p>
<blockquote>
<p>I am by no means an expert in assembly or in X11. I just hope to provide an entertaining, approachable article, something a beginner can understand. Something I wished I had found when I was learning those topics. If you spot an error, please open a <a href="https://github.com/gaultier/blog">Github issue</a>!</p>
</blockquote>
<p><em>Note: Authentication is optional in the X11 protocol, but some X11 servers e.g. XWayland require it. Authentication is skipped here and is handled in a separate <a href="/blog/write_a_video_game_from_scratch_like_1987.html#authentication">article</a>.</em></p>
<h2 id="3018859686-what-do-we-need">
  <a class="title" href="#3018859686-what-do-we-need">What do we need?</a>
  <a class="hash-anchor" href="#3018859686-what-do-we-need" aria-hidden="true" onclick="navigator.clipboard.writeText(this.href);"></a>
</h2>
<p>I will be using the <code>nasm</code> assembler which is simple, cross-platform, fast, and has quite a readable syntax.</p>
<p>For the GUI, I will be using X11 since I am based on Linux and it has some interesting properties that make it easy to do without external libraries. If you are running Wayland, it should work with XWayland out of the box (<em>EDIT: After testing it, I can confirm it does work</em>), and perhaps also on macOS with XQuartz, but I have not tested those (for macOS, remember to tell <code>nasm</code> to use the <code>macho64</code> format, since macOS does not use the ELF format! Also, the stock linker on macOS does not support <code>-static</code>.).</p>
<p>Note that the only difference between *nix operating systems in the context of this program is the system call values. Since I am based on Linux I will be using the Linux system call values, but 'porting' this program to, say, FreeBSD, would only require changing those values, possibly using the <code>nasm</code> macros:</p>
<pre><code class="language-x86asm">%ifdef linux
  %define SYSCALL_EXIT 60
%elifdef freebsd
  %define SYSCALL_EXIT 1
%endif
</code></pre>
<blockquote>
<p><code>%define</code> and its variants are part of the macro system in <code>nasm</code>, which is powerful but we will only use it here to define constants, just like in C: <code>#define FOO 3</code>.</p>
</blockquote>
<p>No need for additional tooling to cross-compile, issues with dynamic libraries, libc differences, etc. Just compile on Linux by defining the right variable on the command line, send the binary to your friend on FreeBSD, and it just works(tm). That's refreshing.</p>
<blockquote>
<p>Some readers have rightfully pointed out that Linux is the only mainstream operating system that officially provides a stable userland ABI; other OSes often break their ABI from (major) version to version and recommend all programs to link to a library (e.g. <code>libSystem</code> in the case of macOS). That layer guarantees API stability, and acts as an insulation layer from breaking changes in the ABI. In practice, for common system calls such as the ones we use here, they very rarely break, but doing more exotic things may break in the future. That actually happened to the Go project in the past on macOS! The solution if that happens is to simply recompile the program on the new version of the OS.</p>
</blockquote>
<p>So let's dive in!</p>
<h2 id="2049729589-x11-basics">
  <a class="title" href="#2049729589-x11-basics">X11 basics</a>
  <a class="hash-anchor" href="#2049729589-x11-basics" aria-hidden="true" onclick="navigator.clipboard.writeText(this.href);"></a>
</h2>
<p>X11 is a server accessible over the network that handles windowing and rendering inside those windows. A client opens a socket, connects to the server, and sends commands in a specific format to open a window, draw shapes, text, etc. The server sends messages about errors or events to the client.</p>
<p>Most applications will want to use <code>libX11</code> or <code>libxcb</code> which offer a C API, but we want to do that ourselves.</p>
<p>Where the server lives is actually not relevant for a client, it might run on the same machine or in a data center far far away. Of course, in the context of a desktop computer in 2023, it will be running on the same machine, but that's a detail.</p>
<p>The <a href="https://www.x.org/releases/X11R7.7/doc/xproto/x11protocol.html">official documentation</a> is pretty good, so when in doubt we can refer to it.</p>
<h2 id="1992549332-main-in-x64-assembly">
  <a class="title" href="#1992549332-main-in-x64-assembly">Main in x64 assembly</a>
  <a class="hash-anchor" href="#1992549332-main-in-x64-assembly" aria-hidden="true" onclick="navigator.clipboard.writeText(this.href);"></a>
</h2>
<p>Let's start slow with a minimal program that simply exits with 0, and build from there.</p>
<p>First, we tell nasm we are writing a 64 bit program and that we target x86_64. Then, we need a main function, which we call <code>_start</code> and which needs to be visible since this is the entry point of our program (hence the <code>global</code> keyword):</p>
<pre><code class="language-x86asm">; Comments start with a semicolon!
BITS 64 ; 64 bits.
CPU X64 ; Target the x86_64 family of CPUs.

section .text
global _start
_start:
  xor rax, rax ; Set rax to 0. Not actually needed, it's just to avoid having an empty body.
</code></pre>
<p><code>section .text</code> is telling <code>nasm</code> and the linker, that what follows is code that should be placed in the text section of the executable.</p>
<p>We will soon have a <code>section .data</code> for our global variables.</p>
<p>Note that those sections usually get mapped by the OS to different pages in memory with different permissions (visible with <code>readelf -l</code>) so that the text section is not writable and the data section is not executable, but that varies from OS to OS.</p>
<p>The <code>_start</code> function has a body that does nothing for now, but not for long. The name of the main function is actually up to us, it's just that <code>start</code> or <code>_start</code> is usual.</p>
<p>We build and run our little program like this:</p>
<pre><code class="language-sh">$ nasm -f elf64 -g main.nasm &amp;&amp; ld main.o -static -o main
</code></pre>
<p><code>nasm</code> actually only produces an object file, so to get an executable out of it, we need to invoke the linker <code>ld</code>. The flag <code>-g</code> is telling <code>nasm</code> to produce debugging information which is immensely useful when writing raw assembly, since firing the debugger is often our only recourse in face of a bug.</p>
<p><em>To remove the debugging information, we can pass <code>-s</code> to the linker, for example when we are about to ship our program and want to save a few KiB.</em></p>
<p>We finally have an executable:</p>
<pre><code class="language-sh">$ file ./main
main: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), statically linked, with debug_info, not stripped
</code></pre>
<p>We can see the different sections with <code>readelf -a ./main</code>, and it tells us that the <code>.text</code> section, which contains our code, is only 3 bytes long.</p>
<p>Now, if we try to run our program, it will segfault. That's because we are expected by the operating system to exit (using the exit system call) ourselves (otherwise the CPU will keep executing whatever comes after our entry point until it hits an unmapped page, triggering a segfault). That's what libc does for us in C programs, so let's handle that:</p>
<pre><code class="language-x86asm">%define SYSCALL_EXIT 60

global _start:
_start:
  mov rax, SYSCALL_EXIT
  mov rdi, 0
  syscall
</code></pre>
<blockquote>
<p><code>nasm</code> uses the Intel syntax: <code>&lt;instruction&gt; &lt;destination&gt;, &lt;source&gt;</code>, so <code>mov rdi, 0</code> puts 0 into the register <code>rdi</code>. Other assemblers use the AT&amp;T syntax which swaps the source and destination. My advice: pick one syntax and one assembler and stick to it, both syntaxes are fine and most tools have some support for both.</p>
</blockquote>
<p>Following the System V ABI, which is required on Linux and other Unices for system calls, invoking a system call requires us to put the system call code in the register <code>rax</code>, the parameters to the syscall (up to 6) in the registers <code>rdi</code>, <code>rsi</code>, <code>rdx</code>, <code>rcx</code>, <code>r8</code>, <code>r9</code>, and additional parameters, if any, on the stack (which will not happen in this program so we can forget about it).
We then use the instruction <code>syscall</code> and check <code>rax</code> for the return value, <code>0</code> usually meaning: no error.</p>
<p><em>Note that Linux (and perhaps other Unices?) has a 'fun' difference, which is that the fourth parameter of a system call is actually passed using the register <code>r10</code>.</em></p>
<blockquote>
<p>Astute readers have pointed out that this is the case across all OSes and documented in the x86_64 architecture supplement of the System V ABI. The more you know! That's only for system calls, though, regular functions still use <code>rcx</code> for the fourth parameter.</p>
</blockquote>
<blockquote>
<p>Note that the System V ABI is required when making system calls and when interfacing with C but we are free to use whatever conventions we want in our own assembly code. For a long time, Go was using a different calling convention than the System V ABI, for example, when calling functions (passing arguments on the stack). Most tools (debuggers, profilers) expect the System V ABI though, so I recommend sticking to it.</p>
</blockquote>
<p>Back to our program: when we run it, we see...nothing. That's because everything went well, true to the UNIX philosophy!</p>
<p>We can check the exit code:</p>
<pre><code class="language-sh">$ ./main; echo $?
0
</code></pre>
<p>Changing <code>mov rdi, 0</code> to <code>mov rdi, 8</code> will now result in:</p>
<pre><code class="language-sh">$ ./main; echo $?
8
</code></pre>
<p>Another way to observe system calls made by a program is with <code>strace</code>, which will also prove very useful when troubleshooting. On some BSDs, its equivalent is <code>truss</code> or <code>dtruss</code>.</p>
<pre><code class="language-sh">$ strace ./main
execve(&quot;./main&quot;, [&quot;./main&quot;], 0x7ffc60e6bf10 /* 60 vars */) = 0
exit(8)                                 = ?
+++ exited with 8 +++
</code></pre>
<p>Let's change it back to 0 and continue.</p>
<h2 id="2732446636-a-stack-primer">
  <a class="title" href="#2732446636-a-stack-primer">A stack primer</a>
  <a class="hash-anchor" href="#2732446636-a-stack-primer" aria-hidden="true" onclick="navigator.clipboard.writeText(this.href);"></a>
</h2>
<p>Before we can continue, we need to know the basics of how the stack works in assembly since we have no friendly compiler to do that for us.</p>
<p><strong>The three most important things about the stack are:</strong></p>
<ul>
<li>It grows downwards: to reserve more space on the stack, we decrease the value of <code>rsp</code></li>
<li>A function must restore the stack pointer to its original value before the function returns, meaning, either remember the original value and set <code>rsp</code> to this, or, match every decrement by an increment of the same value.</li>
<li>Before a function call, the stack pointer needs to be 16 bytes aligned, according to the System V ABI. Also, at the very beginning of a function, the stack pointer value is: <code>16*N + 8</code>. That's because before the function call, its value was 16 byte aligned, i.e. <code>16*N</code>, and the <code>call</code> instruction pushes on the stack the current location (the register <code>rip</code>, which is 8 bytes long), to know where to jump when the called function returns.</li>
</ul>
<p>Not abiding by those rules will result in nasty crashes, so be warned. That's because the location of where to jump when the function returns will be likely overwritten and the program will jump to the wrong location. That, or the stack content will be overwritten and the program will operate on wrong values. Bad either way.</p>
<h3 id="657479577-a-small-stack-example">
  <a class="title" href="#657479577-a-small-stack-example">A small stack example</a>
  <a class="hash-anchor" href="#657479577-a-small-stack-example" aria-hidden="true" onclick="navigator.clipboard.writeText(this.href);"></a>
</h3>
<p>Let's write a function that prints <code>hello</code> to the standard out, using the stack, to learn the ropes. An easier way would be to store this static string in the <code>.rodata</code> section, but that would not teach us anything about the stack.</p>
<p>We need to reserve (at least) 5 bytes on the stack, since that's the length in bytes of <code>hello</code>.</p>
<p>The stack looks like this:</p>
<table>
  <tr> <td align="left">...</td> </tr>
  <tr> <td align="left" >rbp</td> </tr>
  <tr> <td align="left" >o</td> </tr>
  <tr> <td align="left" >l</td> </tr>
  <tr> <td align="left" >l</td> </tr>
  <tr> <td align="left" >e</td> </tr>
  <tr> <td align="left" >h</td> </tr>
</table>
<p>And <code>rsp</code> points to the bottom of it.</p>
<p>Here's how we access each element:</p>
<table>
  <thead>
  <tr> <th>Memory location (example)</th> <th>Assembly code</th> <th align="left">Stack element</th> </tr>
  </thead>
  <tbody>
  <tr> <td>0x1016</td> <td></td> <td align="left">...</td> </tr>
  <tr> <td>0x1015</td> <td>rsp + 5</td> <td align="left" >rbp</td> </tr>
  <tr> <td>0x1014</td> <td>rsp + 4</td> <td align="left" >o</td> </tr>
  <tr> <td>0x1013</td> <td>rsp + 3</td> <td align="left" >l</td> </tr>
  <tr> <td>0x1012</td> <td>rsp + 2</td> <td align="left" >l</td> </tr>
  <tr> <td>0x1011</td> <td>rsp + 1</td> <td align="left" >e</td> </tr>
  <tr> <td>0x1010</td> <td>rsp + 0</td> <td align="left" >h</td> </tr>
  </tbody>
</table>
<p>We then pass the address on the stack of the beginning of the string to the <code>write</code> syscall, as well as its length:</p>
<pre><code class="language-asm">%define SYSCALL_WRITE 1
%define STDOUT 1

print_hello:
  push rbp ; Save rbp on the stack to be able to restore it at the end of the function.
  mov rbp, rsp ; Set rbp to rsp

  sub rsp, 5 ; Reserve 5 bytes of space on the stack.
  mov BYTE [rsp + 0], 'h' ; Set each byte on the stack to a string character.
  mov BYTE [rsp + 1], 'e'
  mov BYTE [rsp + 2], 'l'
  mov BYTE [rsp + 3], 'l'
  mov BYTE [rsp + 4], 'o'

  ; Make the write syscall
  mov rax, SYSCALL_WRITE
  mov rdi, STDOUT ; Write to stdout.
  lea rsi, [rsp] ; Address on the stack of the string.
  mov rdx, 5 ; Pass the length of the string which is 5.
  syscall

  add rsp, 5 ; Restore the stack to its original value.

  pop rbp ; Restore rbp
  ret
</code></pre>
<blockquote>
<p><code>lea destination, source</code> loads the effective address of the source into the destination, which is how C pointers are implemented. To dereference a memory location we use square brackets. So, assuming we have just loaded an address into <code>rdi</code> with <code>lea</code>, e.g. <code>lea rdi, [hello_world]</code>, and we want to store the value at the address into <code>rax</code>, we do: <code>mov rax, [rdi]</code>. We usually have to tell <code>nasm</code> how many bytes to dereference with <code>BYTE</code>, <code>WORD</code>, <code>DWORD</code>, <code>QWORD</code> so: <code>mov rax, QWORD [rdi]</code>, because <code>nasm</code> does not keep track of the sizes of each variable. That's also what the C compiler does when we dereference an <code>int8_t</code>, <code>int16_t</code>, <code>int32_t</code>, and <code>int64_t</code> pointer, respectively.</p>
</blockquote>
<p>There is a lot to unpack here.</p>
<p>First, what is <code>rbp</code>? That's a register like any other. But, you can choose to follow the convention of not using this register like the other registers, to store arbitrary values, and instead, use it to store a linked list of call frames. That's a lot of words.</p>
<p>Basically, at the very beginning of a function, the value of <code>rbp</code> is stored on the stack (that's <code>push rbp</code>). Since <code>rbp</code> stores an address (the address of the frame that's called us), we are storing on the stack the address of the caller in a known location.</p>
<p>Immediately after that, we set <code>rbp</code> to <code>rsp</code>, that is, to the stack pointer at the beginning of the function. <code>push rbp</code> and <code>mov rbp, rsp</code> are thus usually referred to as the function prolog.</p>
<p>For the rest of the function body, we treat <code>rbp</code> as a constant and only decrease <code>rsp</code> if we need to reserve space on the stack.</p>
<p>So if function A calls function B which in turn calls function C, and each function stores on the stack the address of the caller frame, we know where to find on the stack the address of each. Thus, we can print a stack trace in any location of our program simply by inspecting the stack. Pretty nifty. That's already very useful to profilers and other similar tools.</p>
<p>We must not forget of course, just before we exit the function, to restore <code>rbp</code> to its original value (which is still on the stack at that point): that's <code>pop rbp</code>. This is also known as the function epilog. Another way to look at it is that we remove the last element of the linked list of call frames, since we are exiting the leaf function.</p>
<p>Don't worry if you have not fully understood everything, just remember to always have the function epilogs and prologs and you'll be fine:</p>
<pre><code class="language-x86asm">my_function:
  push rbp
  mov rbp, rsp

  sub rsp, N

  [...]


  add rsp, N
  pop rbp
  ret
</code></pre>
<p><strong>Note</strong>: There is an optimization method that uses <code>rbp</code> as a standard register (with a C compiler, that's the flag <code>-fomit-frame-pointer</code>), which means we lose the information about the call stack. My advice is: never do this, it is not worth it.</p>
<blockquote>
<p>Wait, but didn't you say the stack needs to be 16 byte aligned (that is, a multiple of 16)? Last time I checked, 5 is not really a multiple of 16!</p>
</blockquote>
<p>Good catch! The only reason why this program works, is that <code>print_hello</code> is a leaf function, meaning it does not call another function. Remember, the stack needs to be 16 bytes aligned when we do a <code>call</code>!</p>
<p>So the correct way would be:</p>
<pre><code class="language-asm">print_hello:
  push rbp
  mov rbp, rsp

  sub rsp, 16
  mov BYTE [rsp + 0], 'h'
  mov BYTE [rsp + 1], 'e'
  mov BYTE [rsp + 2], 'l'
  mov BYTE [rsp + 3], 'l'
  mov BYTE [rsp + 4], 'o'

  mov rax, SYSCALL_WRITE
  mov rdi, STDOUT
  lea rsi, [rsp]
  mov rdx, 5
  syscall

  call print_world

  add rsp, 16

  pop rbp
  ret
</code></pre>
<p>Since when we enter the function, the value of <code>rsp</code> is <code>16*N+8</code>, and pushing <code>rbp</code> decreases it by 8, the stack pointer is 16 bytes aligned at the point of <code>sub rsp, 16</code>. Decrementing it by 16 (or a multiple of 16) keeps it 16 bytes aligned.</p>
<p>We now can safely call another function from within <code>print_hello</code>:</p>
<pre><code class="language-x86asm">print_world:
  push rbp
  mov rbp, rsp

  sub rsp, 16
  mov BYTE [rsp + 0], ' '
  mov BYTE [rsp + 1], 'w'
  mov BYTE [rsp + 2], 'o'
  mov BYTE [rsp + 3], 'r'
  mov BYTE [rsp + 4], 'l'
  mov BYTE [rsp + 5], 'd'

  mov rax, SYSCALL_WRITE
  mov rdi, STDOUT
  lea rsi, [rsp]
  mov rdx, 6
  syscall

  add rsp, 16

  pop rbp
  ret

print_hello:
  push rbp
  mov rbp, rsp

  sub rsp, 16
  mov BYTE [rsp + 0], 'h'
  mov BYTE [rsp + 1], 'e'
  mov BYTE [rsp + 2], 'l'
  mov BYTE [rsp + 3], 'l'
  mov BYTE [rsp + 4], 'o'

  mov rax, SYSCALL_WRITE
  mov rdi, STDOUT
  lea rsi, [rsp]
  mov rdx, 5
  syscall

  call print_world

  add rsp, 16

  pop rbp
  ret
</code></pre>
<p>And we get <code>hello world</code> as an output.</p>
<p>Now, try to do <code>sub rsp, 5</code> in <code>print_hello</code>, and your program <em>may</em> crash. There is no guarantee, that's what makes it hard to track down.</p>
<p>My advice is:</p>
<ul>
<li>Always use the standard function prologs and epilogs</li>
<li>Always increment/decrement <code>rsp</code> by (a multiple of) 16</li>
<li>Address items on the stack relative to <code>rsp</code>, i.e. <code>mov BYTE [rsp + 4], 'o'</code></li>
<li>If you have to decrement <code>rsp</code> by a value that's unknown at compile time (similar to how <code>alloca()</code> works in C), you can <code>and rsp, -16</code> to 16 bytes align it.</li>
</ul>
<p>And you'll be safe.</p>
<p>The last point is interesting, see for yourself:</p>
<pre><code class="language-shell">(gdb) p -100 &amp; -16
$1 = -112
(gdb) p -112 &amp; -16
$2 = -112
</code></pre>
<p>Which translates in assembly to:</p>
<pre><code class="language-asm">sub rsp, 100
and rsp, -16
</code></pre>
<p>Finally, following those conventions means that our assembly functions can be safely called from C or other languages following the <a href="https://wiki.osdev.org/System_V_ABI">System V ABI</a>, without any modification, which is great.</p>
<p><em>I have not talked about the red zone, which is a 128 byte region just below the stack pointer that our program is free to use as it pleases without having to change the stack pointer. In my opinion, it is not helpful and creates hard to track bugs, so I do not recommend using it. To disable it entirely, run: <code>nasm -f elf64 -g main.nasm &amp;&amp; cc main.o -static -o main -mno-red-zone -nostdlib</code></em>.</p>
<h2 id="4163415294-opening-a-socket">
  <a class="title" href="#4163415294-opening-a-socket">Opening a socket</a>
  <a class="hash-anchor" href="#4163415294-opening-a-socket" aria-hidden="true" onclick="navigator.clipboard.writeText(this.href);"></a>
</h2>
<p>We now are ready to open a socket with the <code>socket(2)</code> syscall, so we add a few constants, taken from the libc headers (<em>note that those values might actually be different on a different Unix, I have not checked. Again, a few <code>%ifdef</code> can easily remedy this discrepancy</em>):</p>
<pre><code class="language-x86asm">%define AF_UNIX 1
%define SOCK_STREAM 1

%define SYSCALL_SOCKET 41
</code></pre>
<p>The <code>AF_UNIX</code> constant means we want a Unix domain socket, and <code>SOCK_STREAM</code> means <a href="https://en.wikipedia.org/wiki/Unix_domain_socket">stream-oriented</a>. We use a domain socket since we know that our server is running on the same machine and it should be faster, but we could change it to <code>AF_INET</code> to connect to a remote IPv4 address for example.</p>
<p>We then fill the relevant registers with those values and invoke the system call:</p>
<pre><code class="language-x86asm">  mov rax, SYSCALL_SOCKET
  mov rdi, AF_UNIX ; Unix socket.
  mov rsi, SOCK_STREAM ; Stream oriented.
  mov rdx, 0 ; Automatic protocol.
  syscall
</code></pre>
<p>The C equivalent would be: <code>socket(AF_UNIX, SOCK_STREAM, 0);</code>. So you see that if we fill the registers in the same order as the C function parameters, we stay close to what C code would do.</p>
<p>The whole program now looks like this:</p>
<pre><code class="language-x86asm">BITS 64 ; 64 bits.
CPU X64 ; Target the x86_64 family of CPUs.

section .text

%define AF_UNIX 1
%define SOCK_STREAM 1

%define SYSCALL_SOCKET 41
%define SYSCALL_EXIT 60

global _start:
_start:
  ; open a unix socket.
  mov rax, SYSCALL_SOCKET
  mov rdi, AF_UNIX ; Unix socket.
  mov rsi, SOCK_STREAM ; Stream oriented.
  mov rdx, 0 ; automatic protocol.
  syscall


  ; The end.
  mov rax, SYSCALL_EXIT
  mov rdi, 0
  syscall
</code></pre>
<p>Building and running it under <code>strace</code> shows that it works and we get a socket with the file descriptor <code>3</code> (in this case, it might be different for you if you are following at home):</p>
<pre><code class="language-sh">$ nasm -f elf64 -g main.nasm &amp;&amp; ld main.o -static -o main 
$ strace ./main
execve(&quot;./main&quot;, [&quot;./main&quot;], 0x7ffe54dfe550 /* 60 vars */) = 0
socket(AF_UNIX, SOCK_STREAM, 0)         = 3
exit(0)                                 = ?
+++ exited with 0 +++
</code></pre>
<h2 id="2750592591-connecting-to-the-server">
  <a class="title" href="#2750592591-connecting-to-the-server">Connecting to the server</a>
  <a class="hash-anchor" href="#2750592591-connecting-to-the-server" aria-hidden="true" onclick="navigator.clipboard.writeText(this.href);"></a>
</h2>
<p>Now that we have created a socket, we can connect to the server with the <code>connect(2)</code> system call.</p>
<p>It's a good time to extract that logic in its own little function, just like in any other high-level language.</p>
<pre><code class="language-x86asm">x11_connect_to_server:
  ; TODO
</code></pre>
<p>In assembly, a function is simply a label we can jump to. But for clarity, both for readers of the code and tools, we can add a hint that this is a real function we can call, like this: <code>call x11_connect_to_server</code>. This will improve the call stack for example when using <code>strace -k</code>. This hint has the form (in <code>nasm</code>): <code>static &lt;name of the function&gt;:function</code>.</p>
<p>Of course, we also need to add our standard function prolog and epilog:</p>
<pre><code class="language-x86asm">x11_connect_to_server:
static x11_connect_to_server:function
  push rbp
  mov rbp, rsp
  
  pop rbp
  ret
</code></pre>
<p>An additional help when reading functions in assembly code is adding comments describing what parameters they accept and what is the return value, if any. Since there is no language level feature for this, we resort to comments:</p>
<pre><code class="language-x86asm">; Create a UNIX domain socket and connect to the X11 server.
; @returns The socket file descriptor.
x11_connect_to_server:
static x11_connect_to_server:function
  push rbp
  mov rbp, rsp
  
  pop rbp
  ret
</code></pre>
<p>First, let's move the socket creation logic to our function and call it in the program:</p>
<pre><code class="language-x86asm">; Create a UNIX domain socket and connect to the X11 server.
; @returns The socket file descriptor.
x11_connect_to_server:
static x11_connect_to_server:function
  push rbp
  mov rbp, rsp
  
  ; Open a Unix socket: socket(2).
  mov rax, SYSCALL_SOCKET
  mov rdi, AF_UNIX ; Unix socket.
  mov rsi, SOCK_STREAM ; Stream oriented.
  mov rdx, 0 ; Automatic protocol.
  syscall

  cmp rax, 0
  jle die

  mov rdi, rax ; Store socket fd in `rdi` for the remainder of the function.

  pop rbp
  ret

die:
  mov rax, SYSCALL_EXIT
  mov rdi, 1
  syscall

_start:
global _start:function
  call x11_connect_to_server
  
  ; The end.
  mov rax, SYSCALL_EXIT
  mov rdi, 0
  syscall
</code></pre>
<p>The error checking is very simplistic: we only check that the return value of the system call (in <code>rax</code>) is what we expect, otherwise we exit the program with a non-zero code by jumping to the <code>die</code> section.</p>
<blockquote>
<p><code>jle</code> is a conditional jump, which inspects global flags, hopefully set just before with <code>cmp</code> or <code>test</code>, and jumps to a label if the condition is true. Here, we compare the returned value with 0, and if it is lower or equal to 0, we jump to the error label. That's how we implement conditionals and loops.</p>
</blockquote>
<hr />
<p>Ok, we can finally connect to the server now. The <code>connect(2)</code> system call takes the address of a <code>sockaddr_un</code> structure as the second argument. This structure is too big to fit in a register.</p>
<p>This is the first syscall we encounter that needs to be passed a pointer, in other words, the address of a region in memory. That region can be on the stack or on the heap, or even be our own executable mapped in memory. That's assembly, we get to do whatever we want.</p>
<p>Since we want to keep things simple and fast, we will store everything in this program on the stack. And since we have 8 MiB of it (according to <code>limit</code>, on my machine, that is), it'll be plenty enough. Actually, the most space we will need on the stack in this program will be 32 KiB.</p>
<p>The size of the <code>sockaddr_un</code> structure is 110 bytes, so we reserve 112 to align <code>rsp</code> to 16 bytes.</p>
<blockquote>
<p>Nasm does have structs, but they are rather a way to define offsets with a name, than structures like in C with a specific syntax to address a specific field. For the sake of simplicity, I'll use the manual way, without <code>nasm</code> structs.</p>
</blockquote>
<p>We set the first 2 bytes of this structure to <code>AF_UNIX</code> since this is a domain socket. Then comes the path of the Unix domain socket which X11 expects to be in a certain format. We want to display our window on the first monitor starting at 0, so the string is: <code>/tmp/.X11-unix/X0</code>.</p>
<p>In C, we would do:</p>
<pre><code class="language-c">  const sockaddr_un addr = {.sun_family = AF_UNIX,
                            .sun_path = &quot;/tmp/.X11-unix/X0&quot;};
  const int res =
      connect(x11_socket_fd, (const struct sockaddr *)&amp;addr, sizeof(addr));
</code></pre>
<p>How do we translate that to assembly, especially the string part?</p>
<p>We could set each byte to each character of the string in the structure, on the stack, manually, one by one. Another <a href="https://en.wikibooks.org/wiki/X86_Assembly/Data_Transfer#Move_String">way</a> to do it is to use the <code>rep movsb</code> idiom, which instructs the CPU to copy a character from a string A to another string B, N times. This is exactly what we need!</p>
<p>The way it works is:</p>
<ul>
<li>We put the string in the <code>.rodata</code> section (same as the data section but read-only)</li>
<li>We load its address in <code>rsi</code> (it's the source)</li>
<li>We load the address of the string in the structure on the stack in <code>rdi</code> (it's the destination)</li>
<li>We set <code>rcx</code> to the number of bytes to be copied</li>
<li>We use <code>cld</code> to clear the <code>DF</code> flag to ensure the copy is done forwards (since it can also be done backwards)</li>
<li>We call <code>rep movsb</code> and voila</li>
</ul>
<p>It's basically <code>memcpy</code> from C.</p>
<blockquote>
<p>This is an interesting case: we can see that some instructions expect some of their operands to be in certain registers and there is no way around it. So, we have to plan ahead and expect those registers to be overwritten. If we need to keep their original values around, we have to store those values elsewhere, for example on the stack (that's called spilling) or in other registers. This is a broader topic of register allocation which is NP-hard! In small functions, it's manageable though.</p>
</blockquote>
<p>First, the <code>.rodata</code> section:</p>
<pre><code class="language-x86asm">section .rodata

sun_path: db &quot;/tmp/.X11-unix/X0&quot;, 0
static sun_path:data
</code></pre>
<p>Then we copy the string:</p>
<pre><code class="language-x86asm">  mov WORD [rsp], AF_UNIX ; Set sockaddr_un.sun_family to AF_UNIX
  ; Fill sockaddr_un.sun_path with: &quot;/tmp/.X11-unix/X0&quot;.
  lea rsi, sun_path
  mov r12, rdi ; Save the socket file descriptor in `rdi` in `r12`.
  lea rdi, [rsp + 2]
  cld ; Move forward
  ; The path is 17 characters long, plus 1 for the null terminator.
  ; Copying more than that would read past the end of `sun_path`.
  mov ecx, 18 ; Length is 18 with the null terminator.
  rep movsb ; Copy.
</code></pre>
<blockquote>
<p><code>ecx</code> is the 32 bit form of the register <code>rcx</code>, meaning we only write here the lower 32 bits of the 64 bit register (on x86-64, writing a 32 bit register also zeroes the upper 32 bits of the full register, so the whole of <code>rcx</code> holds the value we expect here). <a href="https://wiki.osdev.org/CPU_Registers_x86-64">This handy table</a> lists all of the forms for all of the registers. But be cautious of the pitfall case of only setting a value in an 8 or 16 bit part of a register (e.g. <code>cl</code> or <code>cx</code>), and then using the whole register later. The rest of the bits that have not been set will contain some past value, which is hard to troubleshoot. The solution is to use <code>movzx</code> to zero extend, meaning setting the rest of the bits to 0. A good way to visualize this is to use <code>info registers</code> within gdb, and that will display for each register the value for each of its forms, e.g. for <code>rcx</code>, it will display the value for <code>rcx</code>, <code>ecx</code>, <code>cx</code>, <code>ch</code>, <code>cl</code>.</p>
</blockquote>
<p>Then, we do the syscall, check the returned value, exit the program if the value is not 0, and finally return the socket file descriptor, which will be used every time in the rest of the program when talking to the X11 server.</p>
<p>Everything together, it looks like:</p>
<pre><code class="language-x86asm">; Create a UNIX domain socket and connect to the X11 server.
; @returns The socket file descriptor.
x11_connect_to_server:
static x11_connect_to_server:function
  push rbp
  mov rbp, rsp

  ; `r12` is used below to keep the socket fd alive across `rep movsb`.
  ; It is a callee-saved register in the System V ABI, so we preserve the caller's value.
  push r12
  sub rsp, 8 ; Padding: keep `rsp` 16 bytes aligned after the extra push.

  ; Open a Unix socket: socket(2).
  mov rax, SYSCALL_SOCKET
  mov rdi, AF_UNIX ; Unix socket.
  mov rsi, SOCK_STREAM ; Stream oriented.
  mov rdx, 0 ; Automatic protocol.
  syscall

  cmp rax, 0
  jle die

  mov rdi, rax ; Store socket fd in `rdi` for the remainder of the function.

  sub rsp, 112 ; Store struct sockaddr_un on the stack.

  mov WORD [rsp], AF_UNIX ; Set sockaddr_un.sun_family to AF_UNIX
  ; Fill sockaddr_un.sun_path with: &quot;/tmp/.X11-unix/X0&quot;.
  lea rsi, sun_path
  mov r12, rdi ; Save the socket file descriptor in `rdi` in `r12`.
  lea rdi, [rsp + 2]
  cld ; Move forward
  ; The path is 17 characters long, plus 1 for the null terminator.
  ; Copying more than that would read past the end of `sun_path`.
  mov ecx, 18 ; Length is 18 with the null terminator.
  rep movsb ; Copy.

  ; Connect to the server: connect(2).
  mov rax, SYSCALL_CONNECT
  mov rdi, r12
  lea rsi, [rsp]
  %define SIZEOF_SOCKADDR_UN 2+108
  mov rdx, SIZEOF_SOCKADDR_UN
  syscall

  cmp rax, 0
  jne die

  mov rax, r12 ; Return the socket fd.

  add rsp, 112 ; Release struct sockaddr_un.
  add rsp, 8 ; Drop the alignment padding.
  pop r12 ; Restore the caller's `r12`.
  pop rbp
  ret
</code></pre>
<p>We are ready to talk to the X11 server!</p>
<h2 id="484246098-sending-data-over-the-socket">
  <a class="title" href="#484246098-sending-data-over-the-socket">Sending data over the socket</a>
  <a class="hash-anchor" href="#484246098-sending-data-over-the-socket" aria-hidden="true" onclick="navigator.clipboard.writeText(this.href);"></a>
</h2>
<p>There is the <code>send(2)</code> syscall to do this, but we can keep it simple and use the generic <code>write(2)</code> syscall instead. Either way works.</p>
<pre><code class="language-x86asm">%define SYSCALL_WRITE 1
</code></pre>
<p>The C structure for the handshake in the case of success looks like this:</p>
<pre><code class="language-c">typedef struct {
  u8 order;
  u8 pad1;
  u16 major, minor;
  u16 auth_proto_len, auth_data_len;
  u16 pad2;
  // Optionally, authorization information follow, if `auth_proto_len` and `auth_data_len` are  not 0.
} x11_connection_req_t;
</code></pre>
<p><code>pad*</code> fields can be ignored since they are padding and their value is not read by the server.</p>
<p>For our handshake, we need to set the <code>order</code> to be <code>l</code>, that is, little-endian, since X11 can be told to interpret messages as big or little endian. Since x64 is little-endian, we do not want to have an endianness translation layer and so we stick to little-endian.</p>
<p>We also need to set the <code>major</code> field, which is the version, to <code>11</code>. I'll leave it to the reader to guess why.</p>
<p>In C, we would do:</p>
<pre><code class="language-c">  x11_connection_req_t req = {.order = 'l', .major = 11};
</code></pre>
<p>This structure is only 12 bytes long, since we do not use authorization (we leave <code>minor</code> and all subsequent fields as 0).</p>
<p>But since we will have to read the response from the server which is quite big (around 14 KiB during my testing), we will right away reserve a lot of space on the stack, 32 KiB, to be safe:</p>
<pre><code class="language-x86asm">  sub rsp, 1&lt;&lt;15
  ; Zero the whole 12 bytes request first: the fields we do not set explicitly
  ; (minor version, authorization lengths, padding) must be 0, like in the C version.
  mov QWORD [rsp + 0], 0
  mov DWORD [rsp + 8], 0
  mov BYTE [rsp + 0], 'l' ; Set order to 'l'.
  mov WORD [rsp + 2], 11 ; Set major version to 11.
</code></pre>
<p>Then we send it to the server:</p>
<pre><code class="language-x86asm">  ; Send the handshake to the server: write(2).
  mov rax, SYSCALL_WRITE
  mov rdi, rdi
  lea rsi, [rsp]
  mov rdx, 12
  syscall

  cmp rax, 12 ; Check that all bytes were written.
  jnz die
</code></pre>
<p>After that, we read the server response, which should be at first 8 bytes:</p>
<pre><code class="language-x86asm">  ; Read the server response: read(2).
  ; Use the stack for the read buffer.
  ; The X11 server first replies with 8 bytes. Once these are read, it replies with a much bigger message.
  mov rax, SYSCALL_READ
  mov rdi, rdi
  lea rsi, [rsp]
  mov rdx, 8
  syscall

  cmp rax, 8 ; Check that the server replied with 8 bytes.
  jnz die

  cmp BYTE [rsp], 1 ; Check that the server sent 'success' (first byte is 1).
  jnz die
</code></pre>
<p>The first byte in the server response is <code>0</code> for failure and <code>1</code> for success (and <code>2</code> for authentication but we will not need it here).</p>
<p>The server sends a big message with a lot of general information, which we will need for later, so we store certain fields in global variables located in the data section.</p>
<p>First we add those variables, each 4 bytes big:</p>
<pre><code class="language-x86asm">section .data

id: dd 0
static id:data

id_base: dd 0
static id_base:data

id_mask: dd 0
static id_mask:data

root_visual_id: dd 0
static root_visual_id:data
</code></pre>
<p>Then we read the server response, and skip over the parts we are not interested in. This boils down to incrementing a pointer by a dynamic value, a few times. Note that since we do not do any checks here, that would be a great attack vector to trigger a stack overflow or such in our program.</p>
<pre><code class="language-x86asm">  ; Read the rest of the server response: read(2).
  ; Use the stack for the read buffer.
  mov rax, SYSCALL_READ
  mov rdi, rdi
  lea rsi, [rsp]
  mov rdx, 1&lt;&lt;15
  syscall

  cmp rax, 0 ; Check that the server replied with something.
  jle die

  ; Set id_base globally.
  mov edx, DWORD [rsp + 4]
  mov DWORD [id_base], edx

  ; Set id_mask globally.
  mov edx, DWORD [rsp + 8]
  mov DWORD [id_mask], edx

  ; Read the information we need, skip over the rest.
  lea rdi, [rsp] ; Pointer that will skip over some data.
  
  mov cx, WORD [rsp + 16] ; Vendor length (v).
  movzx rcx, cx

  mov al, BYTE [rsp + 21]; Number of formats (n).
  movzx rax, al ; Fill the rest of the register with zeroes to avoid garbage values.
  imul rax, 8 ; sizeof(format) == 8

  add rdi, 32 ; Skip the connection setup
  add rdi, rcx ; Skip over the vendor information (v).

  ; Skip over padding.
  add rdi, 3
  and rdi, -4

  add rdi, rax ; Skip over the format information (n*8).

  mov eax, DWORD [rdi] ; Store (and return) the window root id.

  ; Set the root_visual_id globally.
  mov edx, DWORD [rdi + 32]
  mov DWORD [root_visual_id], edx
</code></pre>
<hr />
<p>A small aside about padding, <a href="https://github.com/gaultier/blog/issues/6">thanks to a perspicacious reader</a>:</p>
<p>How we skip padding is the only bit of smartness we allow ourselves: some fields in the X11 protocol have a variable length. But the X11 protocol counts everything in units of '4 bytes'.</p>
<p>Meaning, if a field is only 5 bytes long, per the protocol, there will be 3 bytes of padding (which should be skipped over by the application), so that the field occupies 2 units of 4 bytes (it is 4 bytes-aligned).</p>
<p>How do we do that then? The specification uses some division and modulo operations, but those are annoying to do in assembly. We can do better.</p>
<p><code>libX11</code> uses this macro:</p>
<pre><code class="language-c">#define ROUNDUP(nbytes, pad) (((nbytes) + ((pad)-1)) &amp; ~(long)((pad)-1))
</code></pre>
<p>And it should be used so:</p>
<pre><code class="language-c">assert(ROUNDUP(0, 4) == 0);
assert(ROUNDUP(1, 4) == 4);
assert(ROUNDUP(2, 4) == 4);
assert(ROUNDUP(3, 4) == 4);
assert(ROUNDUP(4, 4) == 4);
assert(ROUNDUP(5, 4) == 8);
// etc
</code></pre>
<p>This works, but is kind of complex. If we look at this output when compiling this code, we see that <code>gcc</code> smartly optimizes this macro down to:</p>
<pre><code class="language-x86asm">  add     eax, 3
  and     eax, -4
</code></pre>
<p>So we use this form.</p>
<hr />
<p>All together:</p>
<pre><code class="language-x86asm">; Send the handshake to the X11 server and read the returned system information.
; @param rdi The socket file descriptor
; @returns The window root id (uint32_t) in rax.
x11_send_handshake:
static x11_send_handshake:function
  push rbp
  mov rbp, rsp

  ; Reserve 32 KiB: used first for the 12 bytes request, then as the read buffer.
  sub rsp, 1&lt;&lt;15

  ; Zero the whole 12 bytes request first: the fields we do not set explicitly
  ; (minor version, authorization lengths, padding) must be 0, like in the C version.
  mov QWORD [rsp + 0], 0
  mov DWORD [rsp + 8], 0
  mov BYTE [rsp + 0], 'l' ; Set order to 'l'.
  mov WORD [rsp + 2], 11 ; Set major version to 11.

  ; Send the handshake to the server: write(2).
  mov rax, SYSCALL_WRITE
  mov rdi, rdi
  lea rsi, [rsp]
  mov rdx, 12
  syscall

  cmp rax, 12 ; Check that all bytes were written.
  jnz die

  ; Read the server response: read(2).
  ; Use the stack for the read buffer.
  ; The X11 server first replies with 8 bytes. Once these are read, it replies with a much bigger message.
  mov rax, SYSCALL_READ
  mov rdi, rdi
  lea rsi, [rsp]
  mov rdx, 8
  syscall

  cmp rax, 8 ; Check that the server replied with 8 bytes.
  jnz die

  cmp BYTE [rsp], 1 ; Check that the server sent 'success' (first byte is 1).
  jnz die

  ; Read the rest of the server response: read(2).
  ; Use the stack for the read buffer.
  ; All offsets below are relative to the start of this second message.
  mov rax, SYSCALL_READ
  mov rdi, rdi
  lea rsi, [rsp]
  mov rdx, 1&lt;&lt;15
  syscall

  cmp rax, 0 ; Check that the server replied with something.
  jle die

  ; Set id_base globally.
  mov edx, DWORD [rsp + 4]
  mov DWORD [id_base], edx

  ; Set id_mask globally.
  mov edx, DWORD [rsp + 8]
  mov DWORD [id_mask], edx

  ; Read the information we need, skip over the rest.
  lea rdi, [rsp] ; Pointer that will skip over some data.
  
  mov cx, WORD [rsp + 16] ; Vendor length (v).
  movzx rcx, cx

  mov al, BYTE [rsp + 21]; Number of formats (n).
  movzx rax, al ; Fill the rest of the register with zeroes to avoid garbage values.
  imul rax, 8 ; sizeof(format) == 8

  add rdi, 32 ; Skip the connection setup
  add rdi, rcx ; Skip over the vendor information (v).

  ; Skip over padding: round the pointer up to the next multiple of 4.
  add rdi, 3
  and rdi, -4

  add rdi, rax ; Skip over the format information (n*8).

  mov eax, DWORD [rdi] ; Store (and return) the window root id.

  ; Set the root_visual_id globally.
  mov edx, DWORD [rdi + 32]
  mov DWORD [root_visual_id], edx

  add rsp, 1&lt;&lt;15
  pop rbp
  ret
</code></pre>
<blockquote>
<p>From this point on, I will assume you are familiar with the basics of assembly and X11 and will not go as much into details.</p>
</blockquote>
<h2 id="1309822244-generating-ids">
  <a class="title" href="#1309822244-generating-ids">Generating ids</a>
  <a class="hash-anchor" href="#1309822244-generating-ids" aria-hidden="true" onclick="navigator.clipboard.writeText(this.href);"></a>
</h2>
<p>When creating resources on the server-side, we usually first generate an id on the client side, and send that id to the server when creating the resource.</p>
<p>We store the current id in a global variable and increment it each time a new id is generated.</p>
<p>This is how we do it:</p>
<pre><code class="language-x86asm">; Increment the global id.
; @return The new id.
x11_next_id:
static x11_next_id:function
  push rbp
  mov rbp, rsp

  ; Fetch the three globals involved in the computation.
  mov edx, DWORD [id_mask]
  mov edi, DWORD [id_base]
  mov eax, DWORD [id]

  ; eax = (id &amp; id_mask) | id_base, which is the value handed back to the caller.
  and eax, edx
  or eax, edi

  ; Bump the global counter so that the next call yields a different id.
  add DWORD [id], 1

  pop rbp
  ret
</code></pre>
<h2 id="4134081642-opening-a-font">
  <a class="title" href="#4134081642-opening-a-font">Opening a font</a>
  <a class="hash-anchor" href="#4134081642-opening-a-font" aria-hidden="true" onclick="navigator.clipboard.writeText(this.href);"></a>
</h2>
<p>To open a font, which is a prerequisite to draw text, we send a message to the server specifying (part of) the name of the font we want, and the server will select a matching font.</p>
<p>To play with another font, you can use <code>xfontsel</code> which displays all the font names that the X11 server knows about.</p>
<p>First, we generate an id for the font locally, and then we send it alongside the font name.</p>
<pre><code class="language-x86asm">; Open the font on the server side.
; @param rdi The socket file descriptor.
; @param esi The font id.
x11_open_font:
static x11_open_font:function
  push rbp
  mov rbp, rsp

  ; The font name is padded to a multiple of 4 bytes, and the packet length
  ; is expressed in units of 4 bytes: 3 fixed u32 + the padded name.
  %define OPEN_FONT_NAME_BYTE_COUNT 5
  %define OPEN_FONT_PADDING ((4 - (OPEN_FONT_NAME_BYTE_COUNT % 4)) % 4)
  %define OPEN_FONT_PACKET_U32_COUNT (3 + (OPEN_FONT_NAME_BYTE_COUNT + OPEN_FONT_PADDING) / 4)
  %define X11_OP_REQ_OPEN_FONT 0x2d

  sub rsp, 6*8
  ; The upper 16 bits of the first u32 hold the length of the whole packet in
  ; u32 units, not the length of the font name in bytes. Both happen to be 5
  ; for the name `fixed`, but they differ for other font names.
  mov DWORD [rsp + 0*4], X11_OP_REQ_OPEN_FONT | (OPEN_FONT_PACKET_U32_COUNT &lt;&lt; 16)
  mov DWORD [rsp + 1*4], esi ; The font id.
  mov DWORD [rsp + 2*4], OPEN_FONT_NAME_BYTE_COUNT ; Length of the font name in bytes.
  mov BYTE [rsp + 3*4 + 0], 'f'
  mov BYTE [rsp + 3*4 + 1], 'i'
  mov BYTE [rsp + 3*4 + 2], 'x'
  mov BYTE [rsp + 3*4 + 3], 'e'
  mov BYTE [rsp + 3*4 + 4], 'd'


  ; Send the packet to the server: write(2).
  mov rax, SYSCALL_WRITE
  mov rdi, rdi
  lea rsi, [rsp]
  mov rdx, OPEN_FONT_PACKET_U32_COUNT*4
  syscall

  cmp rax, OPEN_FONT_PACKET_U32_COUNT*4 ; Check that all bytes were written.
  jnz die

  add rsp, 6*8

  pop rbp
  ret
</code></pre>
<h2 id="3515439192-creating-a-graphical-context">
  <a class="title" href="#3515439192-creating-a-graphical-context">Creating a graphical context</a>
  <a class="hash-anchor" href="#3515439192-creating-a-graphical-context" aria-hidden="true" onclick="navigator.clipboard.writeText(this.href);"></a>
</h2>
<p>Since an application in X11 can have multiple windows, we first need to create a graphical context containing the general information. When we create a window, we refer to this graphical context by id.</p>
<p>Again, we need to generate an id for the graphical context to be.</p>
<p>X11 stores a hierarchy of windows, so when creating the graphical context, we also need to give it the root window id (i.e. the parent id).</p>
<pre><code class="language-x86asm">; Create a X11 graphical context.
; @param rdi The socket file descriptor.
; @param esi The graphical context id.
; @param edx The window root id.
; @param ecx The font id.
x11_create_gc:
static x11_create_gc:function
  push rbp
  mov rbp, rsp

  sub rsp, 8*8

; Value-mask bits of the CreateGC request. In the X11 protocol, the foreground
; bit is 0x4 and the background bit is 0x8.
%define X11_OP_REQ_CREATE_GC 0x37
%define X11_FLAG_GC_FG 0x00000004
%define X11_FLAG_GC_BG 0x00000008
%define X11_FLAG_GC_FONT 0x00004000
%define X11_FLAG_GC_EXPOSE 0x00010000

%define CREATE_GC_FLAGS X11_FLAG_GC_BG | X11_FLAG_GC_FG | X11_FLAG_GC_FONT
%define CREATE_GC_PACKET_FLAG_COUNT 3
%define CREATE_GC_PACKET_U32_COUNT (4 + CREATE_GC_PACKET_FLAG_COUNT)
%define MY_COLOR_RGB 0x0000ffff

  mov DWORD [rsp + 0*4], X11_OP_REQ_CREATE_GC | (CREATE_GC_PACKET_U32_COUNT&lt;&lt;16)
  mov DWORD [rsp + 1*4], esi ; The graphical context id.
  mov DWORD [rsp + 2*4], edx ; The window root id.
  mov DWORD [rsp + 3*4], CREATE_GC_FLAGS
  ; One value per bit set in the flags, ordered from the lowest bit to the highest.
  mov DWORD [rsp + 4*4], MY_COLOR_RGB ; Foreground (0x4).
  mov DWORD [rsp + 5*4], 0 ; Background (0x8).
  mov DWORD [rsp + 6*4], ecx ; Font (0x4000).

  ; Send the packet to the server: write(2).
  mov rax, SYSCALL_WRITE
  mov rdi, rdi
  lea rsi, [rsp]
  mov rdx, CREATE_GC_PACKET_U32_COUNT*4
  syscall

  cmp rax, CREATE_GC_PACKET_U32_COUNT*4 ; Check that all bytes were written.
  jnz die
  
  add rsp, 8*8

  pop rbp
  ret
</code></pre>
<h2 id="2863200396-creating-the-window">
  <a class="title" href="#2863200396-creating-the-window">Creating the window</a>
  <a class="hash-anchor" href="#2863200396-creating-the-window" aria-hidden="true" onclick="navigator.clipboard.writeText(this.href);"></a>
</h2>
<p>We can now create the window, which refers to the freshly created graphical context.
We also provide the desired x and y coordinates of the window, as well as the desired dimensions (width and height).</p>
<p>Note that those are simply hints and the resulting window may well have different coordinates and dimensions, for example when using a tiling window manager, or when resizing the window.</p>
<pre><code class="language-x86asm">; Create the X11 window.
; @param rdi The socket file descriptor.
; @param esi The new window id.
; @param edx The window root id.
; @param ecx The root visual id.
; @param r8d Packed x and y.
; @param r9d Packed w and h.
x11_create_window:
static x11_create_window:function
  push rbp
  mov rbp, rsp

  ; Opcode, window attribute flags and event flags used in the packet.
  %define X11_OP_REQ_CREATE_WINDOW 0x01
  %define X11_FLAG_WIN_BG_COLOR 0x00000002
  %define X11_FLAG_WIN_EVENT 0x00000800
  %define X11_EVENT_FLAG_KEY_RELEASE 0x0002
  %define X11_EVENT_FLAG_EXPOSURE 0x8000

  ; Packet size: 8 u32 plus one u32 per window attribute flag.
  %define CREATE_WINDOW_FLAG_COUNT 2
  %define CREATE_WINDOW_PACKET_U32_COUNT (8 + CREATE_WINDOW_FLAG_COUNT)
  %define CREATE_WINDOW_BORDER 1
  %define CREATE_WINDOW_GROUP 1

  sub rsp, 12*8 ; The packet is built on the stack.

  ; Fields coming from the function parameters.
  mov DWORD [rsp + 1*4], esi ; New window id.
  mov DWORD [rsp + 2*4], edx ; Window root id.
  mov DWORD [rsp + 3*4], r8d ; Packed x and y.
  mov DWORD [rsp + 4*4], r9d ; Packed w and h.
  mov DWORD [rsp + 6*4], ecx ; Root visual id.

  ; Fields with a constant value.
  mov DWORD [rsp + 0*4], X11_OP_REQ_CREATE_WINDOW | (CREATE_WINDOW_PACKET_U32_COUNT &lt;&lt; 16)
  mov DWORD [rsp + 5*4], CREATE_WINDOW_GROUP | (CREATE_WINDOW_BORDER &lt;&lt; 16)
  mov DWORD [rsp + 7*4], X11_FLAG_WIN_BG_COLOR | X11_FLAG_WIN_EVENT
  mov DWORD [rsp + 8*4], 0
  mov DWORD [rsp + 9*4], X11_EVENT_FLAG_KEY_RELEASE | X11_EVENT_FLAG_EXPOSURE

  ; Send the packet with write(2). `rdi` already holds the socket file descriptor.
  lea rsi, [rsp]
  mov rdx, CREATE_WINDOW_PACKET_U32_COUNT*4
  mov rax, SYSCALL_WRITE
  syscall

  ; Exit the program unless the whole packet was written.
  cmp rax, CREATE_WINDOW_PACKET_U32_COUNT*4
  jnz die

  add rsp, 12*8

  pop rbp
  ret
</code></pre>
<h2 id="577694983-mapping-the-window">
  <a class="title" href="#577694983-mapping-the-window">Mapping the window</a>
  <a class="hash-anchor" href="#577694983-mapping-the-window" aria-hidden="true" onclick="navigator.clipboard.writeText(this.href);"></a>
</h2>
<p>If you are following along at home, and just ran the program, you have realized nothing is displayed.</p>
<p>That is because X11 does not show the window until we have mapped it. This is a simple message to send:</p>
<pre><code class="language-x86asm">; Map a X11 window.
; @param rdi The socket file descriptor.
; @param esi The window id.
x11_map_window:
static x11_map_window:function
  push rbp
  mov rbp, rsp

  %define X11_OP_REQ_MAP_WINDOW 0x08

  sub rsp, 16 ; The 2 u32 packet is built on the stack.

  mov DWORD [rsp + 1*4], esi ; Second u32: the window id.
  mov DWORD [rsp + 0*4], X11_OP_REQ_MAP_WINDOW | (2&lt;&lt;16) ; First u32: the opcode, and 2 in the upper 16 bits.

  ; Send the packet with write(2). `rdi` already holds the socket file descriptor.
  lea rsi, [rsp]
  mov rdx, 2*4
  mov rax, SYSCALL_WRITE
  syscall

  ; Exit the program unless the whole packet was written.
  cmp rax, 2*4
  jnz die

  add rsp, 16

  pop rbp
  ret
</code></pre>
<p>We now have a black window:</p>
<p><img src="x11_x64_black_window.png" alt="Black window" /></p>
<p>Yay!</p>
<h2 id="677275119-polling-for-server-messages">
  <a class="title" href="#677275119-polling-for-server-messages">Polling for server messages</a>
  <a class="hash-anchor" href="#677275119-polling-for-server-messages" aria-hidden="true" onclick="navigator.clipboard.writeText(this.href);"></a>
</h2>
<p>We would like to draw text in the window now, but we have to wait for the <code>Expose</code> event to be sent to us, which means that the window is visible, to be able to start drawing on it.</p>
<p>We want to listen for all server messages actually, be it errors or events, for example when the user presses a key on the keyboard.</p>
<p>If we do a simple blocking <code>read(2)</code>, but the server sends nothing, the program will appear not responding. Not good.
The solution is to use the <code>poll(2)</code> system call to be awoken by the operating system whenever there is data to be read on the socket, a la NodeJS or Nginx.</p>
<blockquote>
<p>A shrewd reader has pointed out that we could simply <code>read</code> from the socket in a loop, since we only have one, possibly with a timeout. Linux, and perhaps others, support setting a read timeout on a socket with <code>setsockopt(2)</code>. But I will keep this version in this article since this is the original one. Feel free to experiment with the alternative at home!</p>
</blockquote>
<p>First, we need to mark the socket as 'non-blocking' since it is by default in blocking mode:</p>
<pre><code class="language-x86asm">; Set a file descriptor in non-blocking mode.
; @param rdi The file descriptor.
set_fd_non_blocking:
static set_fd_non_blocking:function
  push rbp
  mov rbp, rsp

  ; First fcntl(2) call: F_GETFL. `rdi` already holds the file descriptor.
  mov rdx, 0
  mov rsi, F_GETFL
  mov rax, SYSCALL_FCNTL
  syscall

  ; A negative result is an error.
  cmp rax, 0
  jl die

  ; The value returned by the first call, with O_NONBLOCK added,
  ; is the third argument of the second call.
  mov rdx, rax
  or rdx, O_NONBLOCK

  ; Second fcntl(2) call: F_SETFL. `rdi` still holds the file descriptor.
  mov rsi, F_SETFL
  mov rax, SYSCALL_FCNTL
  syscall

  ; A negative result is an error.
  cmp rax, 0
  jl die

  pop rbp
  ret
</code></pre>
<p>Then, we write a small function to read data on the socket. For simplicity, we only read 32 bytes of data, because most messages from X11 are of this size. We also return the first byte which contains the event type.</p>
<pre><code class="language-x86asm">; Read the X11 server reply.
; @return The message code in al.
x11_read_reply:
static x11_read_reply:function
  push rbp
  mov rbp, rsp

  sub rsp, 32 ; 32 bytes read buffer on the stack.
  
  ; read(2) up to 32 bytes into the buffer. `rdi` is passed through unchanged as the file descriptor.
  lea rsi, [rsp]
  mov rdx, 32
  mov rax, SYSCALL_READ
  syscall

  ; Exit the program if 1 byte or less was read (this includes errors).
  cmp rax, 1
  jle die

  mov al, BYTE [rsp] ; The first byte of the buffer is the returned message code.

  add rsp, 32

  pop rbp
  ret
</code></pre>
<p>We now can poll. If an error occurs or the other side has closed their end of the socket, we exit the program.</p>
<pre><code class="language-x86asm">; Poll indefinitely for messages from the X11 server with poll(2).
; The loop has no exit: the function only leaves through `die`, when poll(2)
; fails, when the socket reports an error or a hang-up, or when reading fails.
; @param rdi The socket file descriptor.
; @param esi The window id.
; @param edx The gc id.
poll_messages:
static poll_messages:function
  push rbp
  mov rbp, rsp

  sub rsp, 32

  %define POLLIN 0x001
  %define POLLPRI 0x002
  %define POLLOUT 0x004
  %define POLLERR  0x008
  %define POLLHUP  0x010
  %define POLLNVAL 0x020

  ; struct pollfd at [rsp]: fd (4 bytes), events (2 bytes), revents (2 bytes).
  mov DWORD [rsp + 0*4], edi ; fd
  mov DWORD [rsp + 1*4], POLLIN ; events = POLLIN, revents = 0

  mov DWORD [rsp + 16], esi ; window id
  mov DWORD [rsp + 20], edx ; gc id

  .loop:
    ; poll(fds, 1, -1): wait without timeout.
    mov rax, SYSCALL_POLL
    lea rdi, [rsp]
    mov rsi, 1
    mov rdx, -1
    syscall

    cmp rax, 0
    jle die

    ; revents is the 16-bit field at offset 6 of the pollfd. It is a bit mask,
    ; so test the bits instead of comparing for equality.
    test WORD [rsp + 6], POLLERR | POLLHUP
    jnz die

    ; Load only the 4-byte fd; the next 4 bytes are events/revents.
    mov edi, DWORD [rsp + 0*4]
    call x11_read_reply

    jmp .loop

  ; Not reached: the loop above never falls through.
  add rsp, 32
  pop rbp
  ret
</code></pre>
<h2 id="3433791877-drawing-text">
  <a class="title" href="#3433791877-drawing-text">Drawing text</a>
  <a class="hash-anchor" href="#3433791877-drawing-text" aria-hidden="true" onclick="navigator.clipboard.writeText(this.href);"></a>
</h2>
<p>At last, we can draw text. The small difficulty here is that the text is of unknown length in the general case, so we have to compute the size of the X11 message, including the padding at the end. So far, we only had messages of fixed size.</p>
<p>The official documentation has formulas to compute those values.</p>
<pre><code class="language-x86asm">; Draw text in a X11 window with server-side text rendering (ImageText8 request).
; The request is assembled in a 1024-byte stack buffer and sent with write(2).
; Jumps to `die` if the string is longer than 255 bytes or if the write is incomplete.
; @param rdi The socket file descriptor.
; @param rsi The text string.
; @param edx The text string length in bytes (at most 255).
; @param ecx The window id.
; @param r8d The gc id.
; @param r9d Packed x and y.
x11_draw_text:
static x11_draw_text:function
  push rbp
  mov rbp, rsp

  sub rsp, 1024

  ; The string length is encoded in a single byte of the request header (see
  ; the `shl ..., 8` below), and the string is copied into the fixed-size stack
  ; buffer: refuse anything that does not fit.
  cmp edx, 255
  ja die

  mov DWORD [rsp + 1*4], ecx ; Store the window id directly in the packet data on the stack.
  mov DWORD [rsp + 2*4], r8d ; Store the gc id directly in the packet data on the stack.
  mov DWORD [rsp + 3*4], r9d ; Store x, y directly in the packet data on the stack.

  mov r8d, edx ; Keep the string length in r8d since edx is reused below.
  mov QWORD [rsp + 1024 - 8], rdi ; Store the socket file descriptor on the stack to free the register.

  ; Packet u32 count: 4 header u32 plus the string length rounded up to a whole number of u32.
  mov eax, edx
  add eax, 3
  shr eax, 2
  add eax, 4 ; eax now contains the packet u32 count.

  %define X11_OP_REQ_IMAGE_TEXT8 0x4c
  ; First u32 of the request: opcode, string length in bits 8-15, packet u32 count in bits 16-31.
  mov ecx, eax
  shl ecx, 16
  shl edx, 8
  or ecx, edx
  or ecx, X11_OP_REQ_IMAGE_TEXT8
  mov DWORD [rsp + 0*4], ecx

  ; Zero the u32 that will hold the tail of the string, so that the padding
  ; bytes sent after the string are zero and not leftover stack content.
  ; When the length is a multiple of 4, this touches unused buffer space only.
  mov ecx, r8d
  and ecx, -4
  mov DWORD [rsp + 4*4 + rcx], 0

  ; Copy the text string (source already in rsi) into the packet data on the stack.
  lea rdi, [rsp + 4*4] ; Destination
  cld ; Move forward
  mov ecx, r8d ; String length.
  rep movsb ; Copy.

  mov edx, eax ; packet u32 count
  shl edx, 2 ; Convert to a byte count.
  mov rax, SYSCALL_WRITE
  mov rdi, QWORD [rsp + 1024 - 8] ; fd
  lea rsi, [rsp]
  syscall

  cmp rax, rdx ; Check that all bytes were written.
  jnz die

  add rsp, 1024

  pop rbp
  ret
</code></pre>
<p>We then call this function inside the polling loop, and we store the 'exposed' state in a boolean on the stack to know whether we should render the text or not:</p>
<pre><code class="language-x86asm">    ; x11_read_reply documents its result as being in al, so compare only that byte.
    %define X11_EVENT_EXPOSURE 0xc
    cmp al, X11_EVENT_EXPOSURE
    jnz .received_other_event

    .received_exposed_event:
    mov BYTE [rsp + 24], 1 ; Mark as exposed.

    .received_other_event:

    ; Once exposed, the text is redrawn after every received message.
    cmp BYTE [rsp + 24], 1 ; exposed?
    jnz .loop

    .draw_text:
      mov edi, DWORD [rsp + 0*4] ; socket fd (4 bytes; the next 4 bytes are the poll events/revents)
      lea rsi, [hello_world] ; string
      mov edx, 13 ; length
      mov ecx, [rsp + 16] ; window id
      mov r8d, [rsp + 20] ; gc id
      mov r9d, 100 ; x
      shl r9d, 16
      or r9d, 100 ; y
      call x11_draw_text
</code></pre>
<p>Finally, we see our <code>Hello, world!</code> text displayed inside the window:</p>
<p><img src="x11_x64_final.png" alt="Result" /></p>
<h2 id="1770781618-the-end">
  <a class="title" href="#1770781618-the-end">The end</a>
  <a class="hash-anchor" href="#1770781618-the-end" aria-hidden="true" onclick="navigator.clipboard.writeText(this.href);"></a>
</h2>
<p>Wow, that was a lot. But we did it! We wrote a (albeit simplistic) GUI program in pure assembly, no dependencies, and that's just 600 lines of code in the end.</p>
<p>How did we fare on the executable size part?</p>
<ul>
<li>With debug information: 10744 bytes (10 KiB)</li>
<li>Without debug information (stripped): 8592 bytes (8 KiB)</li>
<li>Stripped and <code>OMAGIC</code> (<code>--omagic</code> linker flag, from the man page: <code>Set the text and data sections to be readable and writable.  Also, do not page-align the data segment</code>): 1776 bytes (1 KiB)</li>
</ul>
<p>Not too shabby, a GUI program in 1 KiB.</p>
<p>Where to go from there?</p>
<ul>
<li>We could move text rendering client-side. Doing it server-side has lots of limitations.</li>
<li>We could add shape rendering, such as quads and circles</li>
<li>We could listen to keyboard and mouse events (the polling loop is easy to extend to do that)</li>
</ul>
<p>I hope that you had as much fun as I did!</p>
<h2 id="1512890027-addendum-the-full-code">
  <a class="title" href="#1512890027-addendum-the-full-code">Addendum: the full code</a>
  <a class="hash-anchor" href="#1512890027-addendum-the-full-code" aria-hidden="true" onclick="navigator.clipboard.writeText(this.href);"></a>
</h2>
<details>
  <summary>The full code</summary>
<pre><code class="language-x86asm">; Build with: nasm -f elf64 -g main.nasm &amp;&amp; ld main.o -static -o main 

BITS 64 ; 64 bits.
CPU X64 ; Target the x86_64 family of CPUs.

; Read-only data.
section .rodata

; Path of the X11 server socket, null-terminated: 17 characters plus the terminator, 18 bytes in total.
sun_path: db &quot;/tmp/.X11-unix/X0&quot;, 0
static sun_path:data

; Text drawn in the window. There is no null terminator: the caller passes the length (13) explicitly.
hello_world: db &quot;Hello, world!&quot;
static hello_world:data

; Mutable globals.
section .data

; Counter from which x11_next_id derives resource ids; incremented on every call.
id: dd 0
static id:data

; Resource id base, written by x11_send_handshake from the server response.
id_base: dd 0
static id_base:data

; Resource id mask, written by x11_send_handshake from the server response.
id_mask: dd 0
static id_mask:data

; Root visual id, written by x11_send_handshake from the server response.
root_visual_id: dd 0
static root_visual_id:data


section .text

; socket(2) arguments.
%define AF_UNIX 1
%define SOCK_STREAM 1

; System call numbers, loaded in rax before each `syscall` instruction.
%define SYSCALL_READ 0
%define SYSCALL_WRITE 1
%define SYSCALL_POLL 7
%define SYSCALL_SOCKET 41
%define SYSCALL_CONNECT 42
%define SYSCALL_EXIT 60
%define SYSCALL_FCNTL 72

; Create a UNIX domain socket and connect to the X11 server at `sun_path`.
; Jumps to `die` if socket(2) or connect(2) fails.
; Clobbers r12, which holds the socket file descriptor during the call.
; @returns The socket file descriptor.
x11_connect_to_server:
static x11_connect_to_server:function
  push rbp
  mov rbp, rsp 

  ; Open a Unix socket: socket(2).
  mov rax, SYSCALL_SOCKET
  mov rdi, AF_UNIX ; Unix socket.
  mov rsi, SOCK_STREAM ; Stream oriented.
  mov rdx, 0 ; Automatic protocol.
  syscall

  cmp rax, 0
  jl die ; Only a negative value is an error: 0 is a valid file descriptor.

  mov r12, rax ; Keep the socket fd in `r12`, since `rdi` is needed for the copy below.

  sub rsp, 112 ; Store struct sockaddr_un on the stack.

  mov WORD [rsp], AF_UNIX ; Set sockaddr_un.sun_family to AF_UNIX
  ; Fill sockaddr_un.sun_path with: &quot;/tmp/.X11-unix/X0&quot;.
  lea rsi, sun_path
  lea rdi, [rsp + 2]
  cld ; Move forward
  mov ecx, 18 ; 17 characters plus the null terminator; copying more would read past `sun_path`.
  rep movsb ; Copy.

  ; Connect to the server: connect(2).
  mov rax, SYSCALL_CONNECT
  mov rdi, r12
  lea rsi, [rsp]
  %define SIZEOF_SOCKADDR_UN 2+108
  mov rdx, SIZEOF_SOCKADDR_UN
  syscall

  cmp rax, 0
  jne die

  mov rax, rdi ; Return the socket fd.

  add rsp, 112
  pop rbp
  ret

; Send the handshake to the X11 server and read the returned system information.
; Stores the resource id base, the resource id mask and the root visual id in the
; globals `id_base`, `id_mask` and `root_visual_id`.
; Jumps to `die` if a write or read fails, or if the server does not report success.
; @param rdi The socket file descriptor
; @returns The window root id (uint32_t) in rax.
x11_send_handshake:
static x11_send_handshake:function
  push rbp
  mov rbp, rsp

  sub rsp, 1&lt;&lt;15

  ; Build the 12-byte connection request. Zero it first: every field other than
  ; the byte order and the major version (minor version, authorization name and
  ; data lengths, unused bytes) must be 0 and not leftover stack content.
  mov QWORD [rsp + 0], 0
  mov DWORD [rsp + 8], 0
  mov BYTE [rsp + 0], 'l' ; Set order to 'l'.
  mov WORD [rsp + 2], 11 ; Set major version to 11.

  ; Send the handshake to the server: write(2).
  mov rax, SYSCALL_WRITE
  mov rdi, rdi
  lea rsi, [rsp]
  mov rdx, 12
  syscall

  cmp rax, 12 ; Check that all bytes were written.
  jnz die

  ; Read the server response: read(2).
  ; Use the stack for the read buffer.
  ; The X11 server first replies with 8 bytes. Once these are read, it replies with a much bigger message.
  mov rax, SYSCALL_READ
  mov rdi, rdi
  lea rsi, [rsp]
  mov rdx, 8
  syscall

  cmp rax, 8 ; Check that the server replied with 8 bytes.
  jnz die

  cmp BYTE [rsp], 1 ; Check that the server sent 'success' (first byte is 1).
  jnz die

  ; Read the rest of the server response: read(2).
  ; Use the stack for the read buffer, overwriting the first 8 bytes read above.
  mov rax, SYSCALL_READ
  mov rdi, rdi
  lea rsi, [rsp]
  mov rdx, 1&lt;&lt;15
  syscall

  cmp rax, 0 ; Check that the server replied with something.
  jle die

  ; Set id_base globally.
  mov edx, DWORD [rsp + 4]
  mov DWORD [id_base], edx

  ; Set id_mask globally.
  mov edx, DWORD [rsp + 8]
  mov DWORD [id_mask], edx

  ; Read the information we need, skip over the rest.

  ; Vendor length (v), rounded up to a multiple of 4: the vendor string is
  ; followed by padding up to the next 4-byte boundary.
  movzx ecx, WORD [rsp + 16]
  add ecx, 3
  and ecx, -4

  movzx eax, BYTE [rsp + 21] ; Number of formats (n).
  shl eax, 3 ; sizeof(format) == 8

  lea rdi, [rsp + 32] ; Skip the connection setup.
  add rdi, rcx ; Skip over the vendor information and its padding.
  add rdi, rax ; Skip over the format information (n*8).

  mov eax, DWORD [rdi] ; Store (and return) the window root id.

  ; Set the root_visual_id globally.
  mov edx, DWORD [rdi + 32]
  mov DWORD [root_visual_id], edx

  add rsp, 1&lt;&lt;15
  pop rbp
  ret

; Produce a new resource id from the global counter, then advance the counter.
; The id is computed from the counter value before the increment.
; @return The new id: (id_mask &amp; id) | id_base.
x11_next_id:
static x11_next_id:function
  push rbp
  mov rbp, rsp

  mov edx, DWORD [id_mask] ; Global id_mask.
  mov edi, DWORD [id_base] ; Global id_base.

  mov eax, DWORD [id] ; Current counter value.
  inc DWORD [id] ; Advance the counter for the next call.

  and eax, edx
  or eax, edi

  pop rbp
  ret

; Open the font named `fixed` on the server side (OpenFont request).
; Jumps to `die` if the request is not written in full.
; @param rdi The socket file descriptor.
; @param esi The font id.
x11_open_font:
static x11_open_font:function
  push rbp
  mov rbp, rsp

  %define OPEN_FONT_NAME_BYTE_COUNT 5
  %define OPEN_FONT_PADDING ((4 - (OPEN_FONT_NAME_BYTE_COUNT % 4)) % 4)
  %define OPEN_FONT_PACKET_U32_COUNT (3 + (OPEN_FONT_NAME_BYTE_COUNT + OPEN_FONT_PADDING) / 4)
  %define X11_OP_REQ_OPEN_FONT 0x2d

  sub rsp, 6*8
  ; The upper 16 bits of the first u32 hold the request length in u32 units,
  ; like in the other requests of this file, not the font name length.
  mov DWORD [rsp + 0*4], X11_OP_REQ_OPEN_FONT | (OPEN_FONT_PACKET_U32_COUNT &lt;&lt; 16)
  mov DWORD [rsp + 1*4], esi
  mov DWORD [rsp + 2*4], OPEN_FONT_NAME_BYTE_COUNT
  ; Zero the last u32 first so that the padding after the name is zero and not leftover stack content.
  mov DWORD [rsp + 4*4], 0
  mov BYTE [rsp + 3*4 + 0], 'f'
  mov BYTE [rsp + 3*4 + 1], 'i'
  mov BYTE [rsp + 3*4 + 2], 'x'
  mov BYTE [rsp + 3*4 + 3], 'e'
  mov BYTE [rsp + 3*4 + 4], 'd'


  ; write(fd, packet, size). The file descriptor is already in rdi.
  mov rax, SYSCALL_WRITE
  lea rsi, [rsp]
  mov rdx, OPEN_FONT_PACKET_U32_COUNT*4
  syscall

  cmp rax, OPEN_FONT_PACKET_U32_COUNT*4
  jnz die

  add rsp, 6*8

  pop rbp
  ret

; Create a X11 graphical context (CreateGC request) carrying three values:
; MY_COLOR_RGB, 0 and the font id, in that order.
; Jumps to `die` if the request is not written in full.
; @param rdi The socket file descriptor.
; @param esi The graphical context id.
; @param edx The window root id.
; @param ecx The font id.
x11_create_gc:
static x11_create_gc:function
  push rbp
  mov rbp, rsp

  sub rsp, 8*8

; NOTE(review): in the X11 core protocol, GC value mask bit 0x4 is the foreground
; and 0x8 the background, so the BG/FG names below look swapped. Confirm against
; the protocol specification; the bytes sent do not depend on the names.
%define X11_OP_REQ_CREATE_GC 0x37
%define X11_FLAG_GC_BG 0x00000004
%define X11_FLAG_GC_FG 0x00000008
%define X11_FLAG_GC_FONT 0x00004000
%define X11_FLAG_GC_EXPOSE 0x00010000

%define CREATE_GC_FLAGS X11_FLAG_GC_BG | X11_FLAG_GC_FG | X11_FLAG_GC_FONT
%define CREATE_GC_PACKET_FLAG_COUNT 3
%define CREATE_GC_PACKET_U32_COUNT (4 + CREATE_GC_PACKET_FLAG_COUNT)
%define MY_COLOR_RGB 0x0000ffff

  ; Parameters first, then the constant parts of the packet.
  mov DWORD [rsp + 1*4], esi ; gc id
  mov DWORD [rsp + 2*4], edx ; window root id
  mov DWORD [rsp + 6*4], ecx ; third value: font id
  mov DWORD [rsp + 0*4], X11_OP_REQ_CREATE_GC | (CREATE_GC_PACKET_U32_COUNT&lt;&lt;16)
  mov DWORD [rsp + 3*4], CREATE_GC_FLAGS ; which values follow
  mov DWORD [rsp + 4*4], MY_COLOR_RGB ; first value
  mov DWORD [rsp + 5*4], 0 ; second value

  ; write(fd, packet, size). The file descriptor is already in rdi.
  mov eax, SYSCALL_WRITE
  mov rsi, rsp
  mov edx, CREATE_GC_PACKET_U32_COUNT*4
  syscall

  cmp rax, CREATE_GC_PACKET_U32_COUNT*4
  jne die

  add rsp, 8*8

  pop rbp
  ret

; Create the X11 window (CreateWindow request) with two values: a background
; color of 0 and an event mask selecting key release and exposure events.
; Jumps to `die` if the request is not written in full.
; @param rdi The socket file descriptor.
; @param esi The new window id.
; @param edx The window root id.
; @param ecx The root visual id.
; @param r8d Packed x and y.
; @param r9d Packed w and h.
x11_create_window:
static x11_create_window:function
  push rbp
  mov rbp, rsp

  %define X11_OP_REQ_CREATE_WINDOW 0x01
  %define X11_FLAG_WIN_BG_COLOR 0x00000002
  %define X11_EVENT_FLAG_KEY_RELEASE 0x0002
  %define X11_EVENT_FLAG_EXPOSURE 0x8000
  %define X11_FLAG_WIN_EVENT 0x00000800

  %define CREATE_WINDOW_FLAG_COUNT 2
  %define CREATE_WINDOW_PACKET_U32_COUNT (8 + CREATE_WINDOW_FLAG_COUNT)
  %define CREATE_WINDOW_BORDER 1
  %define CREATE_WINDOW_GROUP 1

  sub rsp, 12*8

  ; Parameters first, then the constant parts of the packet.
  mov DWORD [rsp + 1*4], esi ; new window id
  mov DWORD [rsp + 2*4], edx ; window root id
  mov DWORD [rsp + 3*4], r8d ; x, y
  mov DWORD [rsp + 4*4], r9d ; w, h
  mov DWORD [rsp + 6*4], ecx ; root visual id
  mov DWORD [rsp + 0*4], X11_OP_REQ_CREATE_WINDOW | (CREATE_WINDOW_PACKET_U32_COUNT &lt;&lt; 16)
  mov DWORD [rsp + 5*4], CREATE_WINDOW_GROUP | (CREATE_WINDOW_BORDER &lt;&lt; 16)
  mov DWORD [rsp + 7*4], X11_FLAG_WIN_BG_COLOR | X11_FLAG_WIN_EVENT ; which values follow
  mov DWORD [rsp + 8*4], 0 ; background color
  mov DWORD [rsp + 9*4], X11_EVENT_FLAG_KEY_RELEASE | X11_EVENT_FLAG_EXPOSURE ; event mask

  ; write(fd, packet, size). The file descriptor is already in rdi.
  mov eax, SYSCALL_WRITE
  mov rsi, rsp
  mov edx, CREATE_WINDOW_PACKET_U32_COUNT*4
  syscall

  cmp rax, CREATE_WINDOW_PACKET_U32_COUNT*4
  jne die

  add rsp, 12*8

  pop rbp
  ret

; Map a X11 window (MapWindow request, 2 u32 long).
; Jumps to `die` if the request is not written in full.
; @param rdi The socket file descriptor.
; @param esi The window id.
x11_map_window:
static x11_map_window:function
  push rbp
  mov rbp, rsp

  sub rsp, 16

  %define X11_OP_REQ_MAP_WINDOW 0x08
  mov DWORD [rsp + 1*4], esi ; window id
  mov DWORD [rsp + 0*4], X11_OP_REQ_MAP_WINDOW | (2&lt;&lt;16) ; opcode and request length

  ; write(fd, packet, 8). The file descriptor is already in rdi.
  mov eax, SYSCALL_WRITE
  mov rsi, rsp
  mov edx, 2*4
  syscall

  cmp rax, 2*4
  jne die

  add rsp, 16

  pop rbp
  ret

; Read one message from the X11 server.
; At most 32 bytes are read into a temporary stack buffer that is discarded on return.
; Jumps to `die` if read(2) fails or returns fewer than 2 bytes.
; @param rdi The socket file descriptor.
; @return The message code (first byte of the message) in al, zero-extended into eax.
x11_read_reply:
static x11_read_reply:function
  push rbp
  mov rbp, rsp

  sub rsp, 32

  ; read(fd, buffer, 32). The file descriptor is already in rdi.
  mov rax, SYSCALL_READ
  lea rsi, [rsp]
  mov rdx, 32
  syscall

  cmp rax, 1
  jle die

  ; Zero-extend the first byte so that a caller comparing the whole of eax
  ; does not depend on what read(2) left in the upper bytes of rax.
  movzx eax, BYTE [rsp]

  add rsp, 32

  pop rbp
  ret

; Terminate the process with exit status 1. Reached by a jump, never returns.
die:
  mov edi, 1 ; Exit status.
  mov eax, SYSCALL_EXIT
  syscall


; Put a file descriptor in non-blocking mode using fcntl(2).
; Jumps to `die` if either fcntl call returns a negative value.
; @param rdi The file descriptor.
set_fd_non_blocking:
static set_fd_non_blocking:function
  push rbp
  mov rbp, rsp

  %define F_GETFL 3
  %define F_SETFL 4

  %define O_NONBLOCK 2048

  ; Fetch the current file status flags: fcntl(fd, F_GETFL, 0).
  ; The file descriptor is already in rdi.
  mov eax, SYSCALL_FCNTL
  mov esi, F_GETFL
  xor edx, edx
  syscall

  test rax, rax
  js die

  ; Write the flags back with O_NONBLOCK added: fcntl(fd, F_SETFL, flags | O_NONBLOCK).
  ; rdi has not been modified, so it still holds the file descriptor.
  mov rdx, rax
  or rdx, O_NONBLOCK
  mov esi, F_SETFL
  mov eax, SYSCALL_FCNTL
  syscall

  test rax, rax
  js die

  pop rbp
  ret

; Poll indefinitely for messages from the X11 server with poll(2), and draw the
; text after every message once an exposure event has been received.
; The loop has no exit: the function only leaves through `die`, when poll(2)
; fails, when the socket reports an error or a hang-up, or when a read or write fails.
; @param rdi The socket file descriptor.
; @param esi The window id.
; @param edx The gc id.
poll_messages:
static poll_messages:function
  push rbp
  mov rbp, rsp

  sub rsp, 32

  %define POLLIN 0x001
  %define POLLPRI 0x002
  %define POLLOUT 0x004
  %define POLLERR  0x008
  %define POLLHUP  0x010
  %define POLLNVAL 0x020

  ; struct pollfd at [rsp]: fd (4 bytes), events (2 bytes), revents (2 bytes).
  mov DWORD [rsp + 0*4], edi ; fd
  mov DWORD [rsp + 1*4], POLLIN ; events = POLLIN, revents = 0

  mov DWORD [rsp + 16], esi ; window id
  mov DWORD [rsp + 20], edx ; gc id
  mov BYTE [rsp + 24], 0 ; exposed? (boolean)

  .loop:
    ; poll(fds, 1, -1): wait without timeout.
    mov rax, SYSCALL_POLL
    lea rdi, [rsp]
    mov rsi, 1
    mov rdx, -1
    syscall

    cmp rax, 0
    jle die

    ; revents is the 16-bit field at offset 6 of the pollfd. It is a bit mask,
    ; so test the bits instead of comparing for equality.
    test WORD [rsp + 6], POLLERR | POLLHUP
    jnz die

    ; Load only the 4-byte fd; the next 4 bytes are events/revents.
    mov edi, DWORD [rsp + 0*4]
    call x11_read_reply

    ; x11_read_reply documents its result as being in al, so compare only that byte.
    %define X11_EVENT_EXPOSURE 0xc
    cmp al, X11_EVENT_EXPOSURE
    jnz .received_other_event

    .received_exposed_event:
    mov BYTE [rsp + 24], 1 ; Mark as exposed.

    .received_other_event:

    cmp BYTE [rsp + 24], 1 ; exposed?
    jnz .loop

    .draw_text:
      mov edi, DWORD [rsp + 0*4] ; socket fd
      lea rsi, [hello_world] ; string
      mov edx, 13 ; length
      mov ecx, [rsp + 16] ; window id
      mov r8d, [rsp + 20] ; gc id
      mov r9d, 100 ; x
      shl r9d, 16
      or r9d, 100 ; y
      call x11_draw_text


    jmp .loop


  ; Not reached: the loop above never falls through.
  add rsp, 32
  pop rbp
  ret

; Draw text in a X11 window with server-side text rendering (ImageText8 request).
; The request is assembled in a 1024-byte stack buffer and sent with write(2).
; Jumps to `die` if the string is longer than 255 bytes or if the write is incomplete.
; @param rdi The socket file descriptor.
; @param rsi The text string.
; @param edx The text string length in bytes (at most 255).
; @param ecx The window id.
; @param r8d The gc id.
; @param r9d Packed x and y.
x11_draw_text:
static x11_draw_text:function
  push rbp
  mov rbp, rsp

  sub rsp, 1024

  ; The string length is encoded in a single byte of the request header (see
  ; the `shl ..., 8` below), and the string is copied into the fixed-size stack
  ; buffer: refuse anything that does not fit.
  cmp edx, 255
  ja die

  mov DWORD [rsp + 1*4], ecx ; Store the window id directly in the packet data on the stack.
  mov DWORD [rsp + 2*4], r8d ; Store the gc id directly in the packet data on the stack.
  mov DWORD [rsp + 3*4], r9d ; Store x, y directly in the packet data on the stack.

  mov r8d, edx ; Keep the string length in r8d since edx is reused below.
  mov QWORD [rsp + 1024 - 8], rdi ; Store the socket file descriptor on the stack to free the register.

  ; Packet u32 count: 4 header u32 plus the string length rounded up to a whole number of u32.
  mov eax, edx
  add eax, 3
  shr eax, 2
  add eax, 4 ; eax now contains the packet u32 count.

  %define X11_OP_REQ_IMAGE_TEXT8 0x4c
  ; First u32 of the request: opcode, string length in bits 8-15, packet u32 count in bits 16-31.
  mov ecx, eax
  shl ecx, 16
  shl edx, 8
  or ecx, edx
  or ecx, X11_OP_REQ_IMAGE_TEXT8
  mov DWORD [rsp + 0*4], ecx

  ; Zero the u32 that will hold the tail of the string, so that the padding
  ; bytes sent after the string are zero and not leftover stack content.
  ; When the length is a multiple of 4, this touches unused buffer space only.
  mov ecx, r8d
  and ecx, -4
  mov DWORD [rsp + 4*4 + rcx], 0

  ; Copy the text string (source already in rsi) into the packet data on the stack.
  lea rdi, [rsp + 4*4] ; Destination
  cld ; Move forward
  mov ecx, r8d ; String length.
  rep movsb ; Copy.

  mov edx, eax ; packet u32 count
  shl edx, 2 ; Convert to a byte count.
  mov rax, SYSCALL_WRITE
  mov rdi, QWORD [rsp + 1024 - 8] ; fd
  lea rsi, [rsp]
  syscall

  cmp rax, rdx ; Check that all bytes were written.
  jnz die

  add rsp, 1024

  pop rbp
  ret

; Program entry point: connect to the X11 server, open the font, create the
; graphical context and the window, map the window, then process server messages.
; Registers kept for the whole program:
;   r15 = socket fd, r12d = window root id, r13d = gc id, r14d = font id, ebx = window id.
_start:
global _start:function
  call x11_connect_to_server
  mov r15, rax ; Socket file descriptor.

  mov rdi, r15
  call x11_send_handshake
  mov r12d, eax ; Window root id.

  call x11_next_id
  mov r13d, eax ; gc id.

  call x11_next_id
  mov r14d, eax ; Font id.

  mov esi, r14d ; font id
  mov rdi, r15 ; socket fd
  call x11_open_font

  mov ecx, r14d ; font id
  mov edx, r12d ; window root id
  mov esi, r13d ; gc id
  mov rdi, r15 ; socket fd
  call x11_create_gc

  call x11_next_id
  mov ebx, eax ; Window id.

  %define WINDOW_W 800
  %define WINDOW_H 600
  mov r9d, WINDOW_W | (WINDOW_H &lt;&lt; 16)
  mov r8d, 200 | (200 &lt;&lt; 16) ; x and y are 200
  mov ecx, [root_visual_id]
  mov edx, r12d ; window root id
  mov esi, ebx ; window id
  mov rdi, r15 ; socket fd
  call x11_create_window

  mov esi, ebx ; window id
  mov rdi, r15 ; socket fd
  call x11_map_window

  mov rdi, r15 ; socket fd
  call set_fd_non_blocking

  mov edx, r13d ; gc id
  mov esi, ebx ; window id
  mov rdi, r15 ; socket fd
  call poll_messages

  ; The end: exit with status 0.
  mov eax, SYSCALL_EXIT
  xor edi, edi
  syscall
</code></pre>
</details>
<p><a href="/blog"> ⏴ Back to all articles</a></p>

<blockquote id="donate">
  <p>If you enjoy what you're reading, you want to support me, and can afford it: <a href="https://paypal.me/philigaultier?country.x=DE&amp;locale.x=en_US">Support me</a>. That allows me to write more cool articles!</p>
</blockquote>

<blockquote>
  <p>
    This blog is <a href="https://github.com/gaultier/blog">open-source</a>!
    If you find a problem, please open a Github issue.
    The content of this blog as well as the code snippets are under the <a href="https://en.wikipedia.org/wiki/BSD_licenses#3-clause_license_(%22BSD_License_2.0%22,_%22Revised_BSD_License%22,_%22New_BSD_License%22,_or_%22Modified_BSD_License%22)">BSD-3 License</a> which I also usually use for all my personal projects. It's basically free for every use but you have to mention me as the original author.
  </p>
</blockquote>

</div>
</body>
</html>