1.26.5 Array of pointers to strings

Here is an example of an array of pointers:

Listing 1.236: Get month name

1
#include <stdio.h> // include standard I/O header
2

3
const char* month1[] = { // array of pointers to constant strings
4
    "January", "February", "March", "April", "May", "June",   // first six months
5
    "July", "August", "September", "October", "November", "December" // last six months
6
};
7

8
// In the range 0..11
9
const char* get_month1 (int month) { // function that receives a month index and returns its name
10
    return month1[month]; // return the pointer to the string at the given index
11
};

x64

Listing 1.237: Optimizing MSVC 2013 x64

1
_DATA SEGMENT
2
month1  DQ FLAT:$SG3122        ; pointer to "January"
3
        DQ FLAT:$SG3123        ; pointer to "February"
4
        DQ FLAT:$SG3124        ; pointer to "March"
5
        DQ FLAT:$SG3125        ; pointer to "April"
6
        DQ FLAT:$SG3126        ; pointer to "May"
7
        DQ FLAT:$SG3127        ; pointer to "June"
8
        DQ FLAT:$SG3128        ; pointer to "July"
9
        DQ FLAT:$SG3129        ; pointer to "August"
10
        DQ FLAT:$SG3130        ; pointer to "September"
11
        DQ FLAT:$SG3131        ; pointer to "October"
12
        DQ FLAT:$SG3132        ; pointer to "November"
13
        DQ FLAT:$SG3133        ; pointer to "December"
14

15
$SG3122 DB 'January', 00H      ; null-terminated string "January"
16
$SG3123 DB 'February', 00H     ; null-terminated string "February"
17
$SG3124 DB 'March', 00H        ; null-terminated string "March"
18
$SG3125 DB 'April', 00H        ; null-terminated string "April"
19
$SG3126 DB 'May', 00H          ; null-terminated string "May"
20
$SG3127 DB 'June', 00H         ; null-terminated string "June"
21
$SG3128 DB 'July', 00H         ; null-terminated string "July"
22
$SG3129 DB 'August', 00H       ; null-terminated string "August"
23
$SG3130 DB 'September', 00H    ; null-terminated string "September"
24
$SG3131 DB 'October', 00H      ; null-terminated string "October"
25
$SG3132 DB 'November', 00H     ; null-terminated string "November"
26
$SG3133 DB 'December', 00H     ; null-terminated string "December"
27
_DATA ENDS
28

29
_month$ = 8                         ; stack offset of the 'month' argument
30
get_month1 PROC
31
    movsxd  rax, ecx                ; sign-extend ECX (month index, 32-bit int) into RAX (64-bit)
32
    lea     rcx, OFFSET FLAT:month1 ; load base address of the pointer table into RCX
33
    mov     rax, QWORD PTR [rcx+rax*8] ; load pointer: base + month*8 (each pointer = 8 bytes in 64-bit)
34
    ret     0                       ; return the pointer in RAX
35
get_month1 ENDP

The first instruction MOVSXD moves the 32-bit value from ECX (which holds the month argument) into RAX with sign-extension (because the month argument is of type int). After that it loads the address of the pointer table into RCX, and then the input value (month) is multiplied by 8 and added to that address — because in a 64-bit environment each pointer takes 8 bytes, so we must multiply by 8 to reach the correct element.

Optimizing GCC 4.9 x64

1
get_month1:
2
    movsx   rdi, edi                    ; sign-extend EDI (month index) into RDI (64-bit)
3
    mov     rax, QWORD PTR month1[0+rdi*8] ; load pointer from table: base + month*8
4
    ret                                 ; return the pointer in RAX

32-bit MSVC

Listing 1.239: Optimizing MSVC 2013 x86

1
_month$ = 8                             ; stack offset of the 'month' argument
2
_get_month1 PROC
3
    mov eax, DWORD PTR _month$[esp-4]   ; load the month index from the stack into EAX
4
    mov eax, DWORD PTR _month1[eax*4]   ; load pointer from table: base + month*4 (each pointer = 4 bytes in 32-bit)
5
    ret 0                               ; return the pointer in EAX
6
_get_month1 ENDP

Here there is no sign-extension to 64-bit, and the multiplication is by 4 because each pointer in 32-bit is 4 bytes.

32-bit ARM

ARM in ARM mode

1
get_month1 PROC
2
    LDR r1,|L0.100|         ; load address of the pointer table into R1
3
    LDR r0,[r1,r0,LSL #2]   ; load pointer: base(R1) + month(R0)*4 (left shift by 2 = multiply by 4)
4
    BX lr                   ; return (branch to link register)
5
ENDP
6

7
|L0.100|
8
DCD ||.data||               ; address of the data section (base of pointer table)
9
DCB "January",0             ; null-terminated string "January"
10
DCB "February",0            ; null-terminated string "February"
11
DCB "March",0               ; null-terminated string "March"
12
DCB "April",0               ; null-terminated string "April"
13
DCB "May",0                 ; null-terminated string "May"
14
DCB "June",0                ; null-terminated string "June"
15
DCB "July",0                ; null-terminated string "July"
16
DCB "August",0              ; null-terminated string "August"
17
DCB "September",0           ; null-terminated string "September"
18
DCB "October",0             ; null-terminated string "October"
19
DCB "November",0            ; null-terminated string "November"
20
DCB "December",0            ; null-terminated string "December"
21

22
AREA ||.data||, DATA, ALIGN=2
23
month1 DCD ||.conststring||         ; pointer to "January"
24
       DCD ||.conststring||+0x8     ; pointer to "February"
25
       DCD ||.conststring||+0x11    ; pointer to "March"
26
       DCD ||.conststring||+0x17    ; pointer to "April"
27
       DCD ||.conststring||+0x1d    ; pointer to "May"
28
       DCD ||.conststring||+0x21    ; pointer to "June"
29
       DCD ||.conststring||+0x26    ; pointer to "July"
30
       DCD ||.conststring||+0x2b    ; pointer to "August"
31
       DCD ||.conststring||+0x32    ; pointer to "September"
32
       DCD ||.conststring||+0x3c    ; pointer to "October"
33
       DCD ||.conststring||+0x44    ; pointer to "November"
34
       DCD ||.conststring||+0x4d    ; pointer to "December"

The address of the table is loaded into R1. Everything else is done with a single LDR instruction. The input value month is left-shifted by 2 (i.e. multiplied by 4), added to R1 (the table address), and then the table element is loaded from that address. The 32-bit element is loaded into R0 from the table.

ARM in Thumb mode

The code is similar to the one above, but less dense, because the LSL suffix cannot be embedded inside an LDR instruction here:

1
get_month1 PROC
2
    LSLS r0,r0,#2       ; R0 = month * 4 (left shift by 2 = multiply by 4, compute byte offset)
3
    LDR r1,|L0.64|      ; load base address of pointer table into R1
4
    LDR r0,[r1,r0]      ; load pointer from table: base(R1) + offset(R0)
5
    BX lr               ; return (branch to link register)
6
ENDP

ARM64

Listing 1.241: Optimizing GCC 4.9 ARM64

1
get_month1:
2
    adrp x1, .LANCHOR0          ; load page address of pointer table into X1 (upper bits)
3
    add x1, x1, :lo12:.LANCHOR0 ; add page offset to get full address of table in X1
4
    ldr x0, [x1,w0,sxtw 3]      ; load pointer: base(X1) + sign-extend(W0) * 8 (sxtw 3 = shift left 3 = multiply by 8)
5
    ret                          ; return the pointer in X0
6

7
.LANCHOR0 = . + 0
8
.type month1, %object
9
.size month1, 96                 ; total size: 12 pointers × 8 bytes = 96 bytes
10
month1:
11
.xword .LC2                      ; pointer to "January"
12
.xword .LC3                      ; pointer to "February"
13
.xword .LC4                      ; pointer to "March"
14
.xword .LC5                      ; pointer to "April"
15
.xword .LC6                      ; pointer to "May"
16
.xword .LC7                      ; pointer to "June"
17
.xword .LC8                      ; pointer to "July"
18
.xword .LC9                      ; pointer to "August"
19
.xword .LC10                     ; pointer to "September"
20
.xword .LC11                     ; pointer to "October"
21
.xword .LC12                     ; pointer to "November"
22
.xword .LC13                     ; pointer to "December"
23

24
.LC2: .string "January"
25
.LC3: .string "February"
26
... (rest is the same)

The address of the table is loaded into X1 using the ADRP/ADD pair. After that the appropriate element is selected with a single LDR, which takes W0 (the register holding the month argument), sign-extends it to 64 bits (that is what the sxtw suffix does), left-shifts the result by 3 bits (i.e. multiplies it by 8), and adds it to X1. The 64-bit value is then loaded from the table into X0.

MIPS

Listing 1.242: Optimizing GCC 4.4.5 (IDA)

1
get_month1:
2
    la $v0, month1          ; load base address of pointer table into $v0
3
    sll $a0, 2              ; $a0 = month * 4 (left shift by 2 = multiply by 4, compute byte offset)
4
    addu $a0, $v0           ; $a0 = table base + byte offset
5
    lw $v0, 0($a0)          ; load pointer from table at that address into $v0
6
    jr $ra                  ; return (jump to return address)
7
    or $at, $zero           ; branch delay slot (NOP)
8

9
.data
10
.globl month1               ; make the table's symbol global (visible to other object files)
11
month1:                     ; table of 32-bit pointers, one .word per month name
12
    .word aJanuary          ; pointer to "January"
13
    .word aFebruary         ; pointer to "February"
14
    ... (rest is the same)
15

16
.data
17
aJanuary: .ascii "January"<0>   ; null-terminated string "January"
18
    ... (all strings the same)

Array overflow

Our function accepts values from 0 to 11. What happens if someone passes 12?

Simply put, the array has no element at index 12, so the function will return a random or strange value.

The function will load whatever value happens to be sitting there and return it. Later, another function might try to fetch a text string from that address and could crash.

Let's compile the example in MSVC for win64 and open it in IDA to see what the linker placed after the table:

Listing 1.243: Executable file in IDA

1
off_140011000 dq offset aJanuary_1     ; DATA XREF: .text:0000000140001003 ; pointer to "January"
2
              dq offset aFebruary_1    ; pointer to "February"
3
              dq offset aMarch_1       ; pointer to "March"
4
              dq offset aApril_1       ; pointer to "April"
5
              dq offset aMay_1         ; pointer to "May"
6
              dq offset aJune_1        ; pointer to "June"
7
              dq offset aJuly_1        ; pointer to "July"
8
              dq offset aAugust_1      ; pointer to "August"
9
              dq offset aSeptember_1   ; pointer to "September"
10
              dq offset aOctober_1     ; pointer to "October"
11
              dq offset aNovember_1    ; pointer to "November"
12
              dq offset aDecember_1    ; pointer to "December"
13

14
aJanuary_1   db 'January',0    ; DATA XREF: sub_140001020+4 ; .data:off_140011000
15
aFebruary_1  db 'February',0   ; DATA XREF: .data:0000000140011008
16
             align 4
17
aMarch_1     db 'March',0      ; DATA XREF: .data:0000000140011010
18
             align 4
19
aApril_1     db 'April',0      ; DATA XREF: .data:0000000140011018

Our program is very small, so there is not much data to place in the data segment — just the month names. But we must keep in mind that the linker might have placed something else there by chance.

So what happens if we pass 12 to the function? It will return the 13th element. Let's see how the CPU treats those bytes as a 64-bit value:

Listing 1.244: Executable file in IDA

1
off_140011000 dq offset qword_140011060    ; DATA XREF: .text:0000000140001003 ; element 0: still the address of "January" — IDA now labels that location as a qword
2
              dq offset aFebruary_1        ; pointer to "February"
3
              dq offset aMarch_1           ; pointer to "March"
4
              dq offset aApril_1           ; pointer to "April"
5
              dq offset aMay_1             ; pointer to "May"
6
              dq offset aJune_1            ; pointer to "June"
7
              dq offset aJuly_1            ; pointer to "July"
8
              dq offset aAugust_1          ; pointer to "August"
9
              dq offset aSeptember_1       ; pointer to "September"
10
              dq offset aOctober_1         ; pointer to "October"
11
              dq offset aNovember_1        ; pointer to "November"
12
              dq offset aDecember_1        ; pointer to "December"
13

14
qword_140011060 dq 797261756E614Ah   ; DATA XREF: sub_140001020+4 ; table base + 12*8 = what index 12 reads — string bytes, not a valid pointer
15
                                     ; these 8 bytes are 'January' plus its terminating zero, read as a little-endian 64-bit integer ('J' = 4Ah is the lowest byte)
16

17
aFebruary_1 db 'February',0    ; DATA XREF: .data:0000000140011008
18
            align 4
19
aMarch_1    db 'March',0       ; DATA XREF: .data:0000000140011010

That value is 0x797261756E614A.

After that, another function (most likely one that processes strings) might try to read bytes from that address, expecting a C-string to be there. It will almost certainly crash, because that value does not look like a valid address.

Array overflow protection

It is naive to expect that every programmer who uses our function or library will respect the rule of not passing a value greater than 11. One way to handle this in C/C++ is to use assertions.

We can modify our program to fail if the value is wrong:

Listing 1.245: assert() added

1
const char* get_month1_checked (int month) { // returns the month name; only the upper bound of the index is checked
2
    assert (month<12); // assertion fails if month >= 12; NOTE: a negative month satisfies month<12 and is NOT caught here
3
    return month1[month]; // return pointer to the month name string
4
};

The assertion macro checks for valid values at the beginning of each function, and fails if the condition turns out to be false.

Listing 1.246: Optimizing MSVC 2013 x64

1
$SG3143 DB 'm', 00H, 'o', 00H, 'n', 00H, 't', 00H, 'h', 00H, '.', 00H
2
         DB 'c', 00H, 00H, 00H   ; UTF-16 encoded filename string: "month.c"
3

4
$SG3144 DB 'm', 00H, 'o', 00H, 'n', 00H, 't', 00H, 'h', 00H, '<', 00H
5
         DB '1', 00H, '2', 00H, 00H, 00H ; UTF-16 encoded condition string: "month<12"
6

7
month$ = 48                             ; stack offset of the 'month' argument
8
get_month1_checked PROC
9
$LN5:
10
    push    rbx                         ; save RBX (callee-saved register)
11
    sub     rsp, 32                     ; allocate shadow space for function calls
12
    movsxd  rbx, ecx                   ; sign-extend month (32-bit) into RBX (64-bit)
13
    cmp     ebx, 12                     ; compare month with 12
14
    jl      SHORT $LN3@get_month1       ; if month < 12, skip the assert and go to normal path
15
    lea     rdx, OFFSET FLAT:$SG3143    ; load filename string "month.c" as second argument
16
    lea     rcx, OFFSET FLAT:$SG3144    ; load condition string "month<12" as first argument
17
    mov     r8d, 29                     ; line number (29) as third argument
18
    call    _wassert                    ; call assert failure handler (prints info and terminates)
19
$LN3@get_month1:
20
    lea     rcx, OFFSET FLAT:month1     ; load base address of pointer table into RCX
21
    mov     rax, QWORD PTR [rcx+rbx*8] ; load pointer: base + month*8 (each pointer = 8 bytes)
22
    add     rsp, 32                     ; restore shadow space
23
    pop     rbx                         ; restore RBX
24
    ret     0                           ; return the pointer in RAX
25
get_month1_checked ENDP

In reality, assert() is not a function — it is a macro. It checks the condition, and then also passes the line number and filename to another function that reports the information to the user. Here we can see that the filename and the condition are written in UTF-16. The line number is also passed (it is 29). This approach is present in almost all compilers.

Here is what GCC does:

Listing 1.247: Optimizing GCC 4.9 x64

1
.LC1: .string "month.c"   ; filename string passed to assert failure handler
2
.LC2: .string "month<12"  ; condition string passed to assert failure handler
3

4
get_month1_checked:
5
    cmp     edi, 11                             ; compare month with 11
6
    jg      .L6                                 ; if month > 11 (i.e. >= 12), jump to assert handler
7
    movsx   rdi, edi                            ; sign-extend month into RDI (64-bit)
8
    mov     rax, QWORD PTR month1[0+rdi*8]      ; load pointer from table: base + month*8
9
    ret                                         ; return the pointer in RAX
10

11
.L6:
12
    push    rax                                         ; align stack to 16 bytes
13
    mov     ecx, OFFSET FLAT:__PRETTY_FUNCTION__.2423   ; function name as 4th argument
14
    mov     edx, 29                                     ; line number as 3rd argument
15
    mov     esi, OFFSET FLAT:.LC1                       ; filename as 2nd argument
16
    mov     edi, OFFSET FLAT:.LC2                       ; condition string as 1st argument
17
    call    __assert_fail                               ; call assert failure handler (terminates program)
18

19
__PRETTY_FUNCTION__.2423: .string "get_month1_checked" ; function name string used by GCC assert

The GCC macro also passes the function name for convenience. Nothing comes for free, and sanitizing checks are no exception. They slow the program down, especially if assert() is present in small, time-critical functions. MSVC for example leaves the checks in debug builds, but in release builds they all disappear. The Windows NT kernel ships in two versions: "checked" and "free" builds. The first contains validation checks (hence "checked"), the second does not (hence "free" from the checks). The "checked" kernel runs slower because of all those checks, which is why it is only used in debugging sessions.

Accessing specific character

An array of pointers to strings can be accessed like this:

1
#include <stdio.h> // include standard I/O header
2

3
const char* month[] = { // array of pointers to month name strings
4
    "January", "February", "March", "April", "May", "June",
5
    "July", "August", "September", "October", "November", "December"
6
};
7

8
int main() {
9
    // 4th month, 5th character:
10
    printf ("%c\n", month[3][4]); // month[3] = "April", month[3][4] = 'l' (5th character, 0-indexed)
11
};

This works because month[3] is an expression of type const char* (here, a pointer to the string "April"). The fifth character is then taken from that expression by adding 4 bytes to its address.

By the way, the list of arguments passed to main() has the same type:

1
#include <stdio.h> // include standard I/O header
2

3
int main(int argc, char *argv[]) { // argv is an array of pointers to argument strings
4
    printf ("3rd argument, 2nd character: %c\n", argv[3][1]); // argv[3] = 3rd argument string, [1] = 2nd character
5
};

It is very important to understand that, even though the syntax looks similar, this is completely different from the two-dimensional arrays we will talk about next.

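Another important thing to notice: this kind of indexing counts bytes, so it selects the N-th character only if the strings are encoded with exactly one byte per character, as in ASCII and extended ASCII. It will not work in general for UTF-8, where a single character may occupy several bytes.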

1.26.6 Multidimensional arrays

Internally, a multidimensional array is exactly the same thing as a one-dimensional array. Since computer memory is linear, it is one single one-dimensional array. For convenience, this multidimensional array can easily be represented as a one-dimensional array.

To be more specific, a two-dimensional array (such as char a[3][4]) has no special two-dimensional form in memory — it is just a long one-dimensional array (12 consecutive cells). The compiler simply lets us write a[x][y] for our convenience, and internally it calculates the address itself.

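For example, this is how the elements of a 3×4 array are arranged in a one-dimensional array of 12 cells:

Offset in memory | Array element
0  | [0][0]
1  | [0][1]
2  | [0][2]
3  | [0][3]
4  | [1][0]
5  | [1][1]
6  | [1][2]
7  | [1][3]
8  | [2][0]
9  | [2][1]
10 | [2][2]
11 | [2][3]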

And here is how each cell of a 3×4 array is laid out in memory (the number in each cell is its offset from the start of the array):

 0  1  2  3
 4  5  6  7
 8  9 10 11

So, to calculate the address of the element we want, we first multiply the first index by 4 (the width of the array) and then add the second index.

To make it simple — assume you have the array char a[3][4], which has 3 rows and 4 columns = 12 cells in memory. Memory is linear, like a number strip from 0 to 11.

To reach a specific cell (for example a[1][2]), the compiler does a very simple calculation:

* The first index = the row number (here 1)

* The array width = number of columns = 4

* Multiply: 1 × 4 = 4 (meaning: "skip the entire first row")

* The second index = the column number (here 2)

* Add: 4 + 2 = 6

So a[1][2] is located at position 6 in the linear strip.

This is called row-major order, and this method of representing arrays is used in C/C++ and Python at least. The term row-major order in plain English means: "first write the elements of the first row, then the second row … and finally the elements of the last row."

There is another representation method called column-major order (where the indices are used in the reverse order), and that is used in Fortran, MATLAB, and R at least. The term column-major order in plain English means: "first write the elements of the first column, then the second column … and finally the elements of the last column."

If we ask ourselves which approach is better — in general, from a performance and cache memory perspective, the best way to organize data is the one where elements are accessed sequentially. So if your function accesses data row by row, then row-major order is better; if it accesses data column by column, then column-major order is better.

Two-dimensional array example

We will work with an array of type char, meaning each element needs only one byte in memory.

Row filling example

Let's fill the second row with the values 0..3:

Listing 1.248: Row filling example

1
#include <stdio.h> // include standard I/O header
2

3
char a[3][4]; // global 2D array: 3 rows × 4 columns = 12 bytes total
4

5
int main() {
6
    int x, y; // loop counters
7

8
    // clear array
9
    for (x=0; x<3; x++)        // iterate over rows
10
        for (y=0; y<4; y++)    // iterate over columns
11
            a[x][y]=0;         // set every element to zero
12

13
    // fill second row by 0..3:
14
    for (y=0; y<4; y++)        // iterate over columns of row 1
15
        a[1][y]=y;             // a[1][0]=0, a[1][1]=1, a[1][2]=2, a[1][3]=3
16
};

If we view the 12 bytes of the array as three rows of four bytes each, the first and third rows still contain only zeros, and the second row now contains the values 0, 1, 2, and 3.

Column filling example

Let's fill the third column with the values 0..2:

1
#include <stdio.h> // include standard I/O header
2

3
char a[3][4]; // global 2D array: 3 rows × 4 columns = 12 bytes total
4

5
int main() {
6
    int x, y; // loop counters
7

8
    // clear array
9
    for (x=0; x<3; x++)        // iterate over rows
10
        for (y=0; y<4; y++)    // iterate over columns
11
            a[x][y]=0;         // set every element to zero
12

13
    // fill third column by 0..2:
14
    for (x=0; x<3; x++)        // iterate over rows
15
        a[x][2]=x;             // a[0][2]=0, a[1][2]=1, a[2][2]=2
16
};

Viewing the array again as three rows of four bytes each, the third position of each row holds the written value — 0 in the first row, 1 in the second, and 2 in the third — and every other byte is zero.

Access two-dimensional array as one-dimensional

We can easily verify that it is possible to access a two-dimensional array as if it were a one-dimensional array, and there are at least two ways to do that:

1
#include <stdio.h> // include standard I/O header
2

3
char a[3][4]; // global 2D array: 3 rows × 4 columns = 12 bytes
4

5
char get_by_coordinates1 (char array[3][4], int a, int b) { // access using 2D array syntax
6
    return array[a][b]; // compiler handles address calculation automatically
7
};
8

9
char get_by_coordinates2 (char *array, int a, int b) {
10
    // treat input array as one-dimensional
11
    // 4 is array width here
12
    return array[a*4+b]; // manually compute linear index: row*width + column
13
};
14

15
char get_by_coordinates3 (char *array, int a, int b) {
16
    // treat input array as pointer,
17
    // calculate address, get value at it
18
    // 4 is array width here
19
    return *(array+a*4+b); // pointer arithmetic: base + row*width + column
20
};
21

22
int main() {
23
    a[2][3]=123;                              // set element at row 2, column 3 to 123
24
    printf ("%d\n", get_by_coordinates1(a, 2, 3)); // a matches the char array[3][4] parameter as is; all three should print 123
25
    printf ("%d\n", get_by_coordinates2((char*)a, 2, 3)); // explicit cast: this function takes a flat char*, and a is char (*)[4] without it
26
    printf ("%d\n", get_by_coordinates3((char*)a, 2, 3)); // same cast for the pointer-arithmetic variant
27
};

When we compile and run it, all three calls print 123. What MSVC 2013 did is remarkable — it generated exactly the same code for all three routines.

Listing 1.250: Optimizing MSVC 2013 x64

1
; All three functions compile to identical machine code
2
array$ = 8          ; first argument: base address of array (in RCX)
3
a$ = 16             ; second argument: row index (in RDX)
4
b$ = 24             ; third argument: column index (in R8)
5

6
get_by_coordinates3 PROC
7
; RCX = address of array
8
; EDX = a (row index, 32-bit int — sign-extended to 64 bits below)
9
; R8D = b (column index, 32-bit int — sign-extended to 64 bits below)
10
    movsxd  rax, r8d            ; sign-extend b (32-bit) into RAX (64-bit)
11
    movsxd  r9, edx             ; sign-extend a (32-bit) into R9 (64-bit)
12
    add     rax, rcx            ; RAX = b + base address of array
13
    movzx   eax, BYTE PTR [rax+r9*4]
14
    ; EAX = zero-extended byte at (b + base) + a*4 = base + a*4 + b, i.e. array[a][b] (4 = row width in bytes)
15
    ret     0
16
get_by_coordinates3 ENDP
17

18
array$ = 8
19
a$ = 16
20
b$ = 24
21
get_by_coordinates2 PROC        ; identical code to get_by_coordinates3
22
    movsxd  rax, r8d            ; sign-extend b into RAX
23
    movsxd  r9, edx             ; sign-extend a into R9
24
    add     rax, rcx            ; RAX = b + base
25
    movzx   eax, BYTE PTR [rax+r9*4] ; load byte at base + a*4 + b
26
    ret     0
27
get_by_coordinates2 ENDP
28

29
array$ = 8
30
a$ = 16
31
b$ = 24
32
get_by_coordinates1 PROC        ; identical code to the other two
33
    movsxd  rax, r8d            ; sign-extend b into RAX
34
    movsxd  r9, edx             ; sign-extend a into R9
35
    add     rax, rcx            ; RAX = b + base
36
    movzx   eax, BYTE PTR [rax+r9*4] ; load byte at base + a*4 + b
37
    ret     0
38
get_by_coordinates1 ENDP

GCC also generates equivalent routines, but slightly different:

1
; RDI = address of array
2
; RSI = a (row index)
3
; RDX = b (column index)
4

5
get_by_coordinates1:
6
    ; sign-extend input 32-bit int values "a" and "b" to 64-bit ones
7
    movsx   rsi, esi                ; sign-extend a into RSI (64-bit)
8
    movsx   rdx, edx                ; sign-extend b into RDX (64-bit)
9
    lea     rax, [rdi+rsi*4]        ; RAX = base + a*4
10
    movzx   eax, BYTE PTR [rax+rdx]
11
    ; AL = load byte at (base + a*4) + b = base + a*4 + b
12
    ret
13

14
get_by_coordinates2:
15
    lea     eax, [rdx+rsi*4]        ; EAX = b + a*4 (compute linear index in 32-bit)
16
    cdqe                            ; sign-extend EAX to RAX (64-bit)
17
    movzx   eax, BYTE PTR [rdi+rax]
18
    ; AL = load byte at base + (b + a*4)
19
    ret
20

21
get_by_coordinates3:
22
    sal     esi, 2                  ; ESI = a << 2 = a*4
23
    ; sign-extend input 32-bit int values "a*4" and "b" to 64-bit ones
24
    movsx   rdx, edx                ; sign-extend b into RDX (64-bit)
25
    movsx   rsi, esi                ; sign-extend a*4 into RSI (64-bit)
26
    add     rdi, rsi                ; RDI = base + a*4
27
    movzx   eax, BYTE PTR [rdi+rdx]
28
    ; AL = load byte at (base + a*4) + b
29
    ret

Three-dimensional array example

The same story applies to multidimensional arrays. Now we will work with an array of type int — each element needs 4 bytes in memory. Let's see:

1
#include <stdio.h> // include standard I/O header
2

3
int a[10][20][30]; // global 3D array: 10 × 20 × 30 = 6000 integers = 24000 bytes
4

5
void insert(int x, int y, int z, int value) { // write a value into the 3D array
6
    a[x][y][z]=value; // compiler computes: base + x*600*4 + y*30*4 + z*4
7
};

x86

When compiled in MSVC 2010:

Listing 1.253: MSVC 2010

1
_DATA SEGMENT
2
COMM _a:DWORD:01770H    ; reserve 0x1770 = 6000 DWORDs = 24000 bytes for array a
3
_DATA ENDS
4

5
PUBLIC _insert
6
_TEXT SEGMENT
7
_x$     = 8    ; size = 4 ; stack offset of argument x
8
_y$     = 12   ; size = 4 ; stack offset of argument y
9
_z$     = 16   ; size = 4 ; stack offset of argument z
10
_value$ = 20   ; size = 4 ; stack offset of argument value
11

12
_insert PROC
13
    push    ebp
14
    mov     ebp, esp
15
    mov     eax, DWORD PTR _x$[ebp]
16
    imul    eax, 2400               ; EAX = x * 2400  (= x * 600 elements * 4 bytes)
17
    mov     ecx, DWORD PTR _y$[ebp]
18
    imul    ecx, 120                ; ECX = y * 120   (= y * 30 elements * 4 bytes)
19
    lea     edx, DWORD PTR _a[eax+ecx]
20
    ; EDX = base_of_a + x*2400 + y*120
21
    mov     eax, DWORD PTR _z$[ebp]
22
    mov     ecx, DWORD PTR _value$[ebp]
23
    mov     DWORD PTR [edx+eax*4], ecx
24
    ; store value at address EDX + z*4 = base + x*2400 + y*120 + z*4
25
    pop     ebp
26
    ret     0
27
_insert ENDP
28
_TEXT ENDS

Nothing special. To compute the element's location (its byte offset from the start of the array), the three input arguments are used in the formula:

address = 600 · 4 · x + 30 · 4 · y + 4z

in order to represent the array as multidimensional. Note that the type int is 32 bits (4 bytes), which is why all the coefficients must be multiplied by 4.

Listing 1.254: GCC 4.4.1

1
public insert
2
insert proc near
3
x     = dword ptr  8    ; stack offset of argument x
4
y     = dword ptr  0Ch  ; stack offset of argument y
5
z     = dword ptr  10h  ; stack offset of argument z
6
value = dword ptr  14h  ; stack offset of argument value
7

8
    push    ebp
9
    mov     ebp, esp
10
    push    ebx
11
    mov     ebx, [ebp+x]    ; EBX = x
12
    mov     eax, [ebp+y]    ; EAX = y
13
    mov     ecx, [ebp+z]    ; ECX = z
14
    lea     edx, [eax+eax]  ; EDX = y*2
15
    mov     eax, edx        ; EAX = y*2
16
    shl     eax, 4          ; EAX = (y*2) << 4 = y*2*16 = y*32
17
    sub     eax, edx        ; EAX = y*32 - y*2 = y*30
18
    imul    edx, ebx, 600   ; EDX = x*600
19
    add     eax, edx        ; EAX = y*30 + x*600
20
    lea     edx, [eax+ecx]  ; EDX = y*30 + x*600 + z
21
    mov     eax, [ebp+value]
22
    mov     dword ptr ds:a[edx*4], eax
23
    ; store value at address a + (x*600 + y*30 + z)*4
24
    pop     ebx
25
    pop     ebp
26
    retn
27
insert endp

GCC handled it differently. For one of the operations in the calculation (30y), GCC generated code without any multiply instructions. Here is what happened:

(y+y) ≪ 4 − (y+y) = (2y) ≪ 4 − 2y = 2·16·y − 2y = 32y − 2y = 30y

So to calculate 30y it used one addition + one shift + one subtraction. That is faster.

ARM + Non-optimizing Xcode 4.6.3 (LLVM) (Thumb mode)

Listing 1.255: Non-optimizing Xcode 4.6.3 (LLVM) (Thumb mode)

1
_insert
2
; value = -0x10
3
; z     = -0xC
4
; y     = -8
5
; x     = -4
6

7
    SUB     SP, SP, #0x10           ; allocate 16 bytes on stack for 4 int-sized local variables
8

9
    MOV     R9, 0xFC2               ; load PC-relative offset to get address of array a
10
    ADD     R9, PC
11
    LDR.W   R9, [R9]               ; R9 = pointer to array a
12

13
    STR     R0, [SP, #0x10+x]      ; save x onto local stack
14
    STR     R1, [SP, #0x10+y]      ; save y onto local stack
15
    STR     R2, [SP, #0x10+z]      ; save z onto local stack
16
    STR     R3, [SP, #0x10+value]  ; save value onto local stack
17

18
    LDR     R0, [SP, #0x10+value]  ; reload value from stack
19
    LDR     R1, [SP, #0x10+z]      ; reload z from stack
20
    LDR     R2, [SP, #0x10+y]      ; reload y from stack
21
    LDR     R3, [SP, #0x10+x]      ; reload x from stack
22

23
    MOV     R12, 2400
24
    MUL.W   R3, R3, R12            ; R3 = x * 2400  (= x * 600 elements * 4 bytes)
25
    ADD     R3, R9                  ; R3 = base_of_a + x*2400
26

27
    MOV     R9, 120
28
    MUL.W   R2, R2, R9             ; R2 = y * 120   (= y * 30 elements * 4 bytes)
29
    ADD     R2, R3                  ; R2 = base + x*2400 + y*120
30

31
    LSLS    R1, R1, #2             ; R1 = z * 4     (left shift by 2 = multiply by 4)
32
    ADD     R1, R2                  ; R1 = base + x*2400 + y*120 + z*4  (final element address)
33

34
    STR     R0, [R1]               ; store value at computed address → a[x][y][z] = value
35

36
    ADD     SP, SP, #0x10          ; deallocate local stack space
37
    BX      LR                     ; return

The non-optimizing LLVM saves all variables onto the local stack, which is unnecessary extra work. The address of the array element is calculated using the same formula we saw before.

ARM + Optimizing Xcode 4.6.3 (LLVM) (Thumb mode)

Listing 1.256: Optimizing Xcode 4.6.3 (LLVM) (Thumb mode)

1
_insert                            ; on entry: R0 = x, R1 = y, R2 = z, R3 = value
2
    MOVW    R9, #0x10FC            ; low 16 bits of R9 = 0x10FC (PC-relative offset used to locate a)
3
    MOVT.W  R9, #0                 ; high 16 bits of R9 = 0
4
    ADD     R9, PC                 ; R9 = PC + 0x10FC = address of the slot holding the pointer to a
5
    LDR.W   R9, [R9]               ; R9 = pointer to array a
6

7
    MOV.W   R12, #2400             ; load constant 2400 (= 600 elements * 4 bytes) into R12
8

9
    RSB.W   R1, R1, R1, LSL #4    ; R1 = y*16 - y = y*15  (RSB = Reverse Subtract: dst = op2 - op1)
10
                                   ; note: the needed term is y*120 (= y * 30 elements * 4 bytes) = y*15*8;
                                   ;       the remaining *8 is applied by LSL #3 in the ADD.W below
11

12
    MLA.W   R0, R0, R12, R9       ; R0 = x*2400 + R9  (MLA = Multiply-Accumulate: R0 = R0*R12 + R9)
13
                                   ; R0 = base_of_a + x*2400
14

15
    ADD.W   R0, R0, R1, LSL #3    ; R0 = R0 + R1*8 = base + x*2400 + y*15*8 = base + x*2400 + y*120
16

17
    STR.W   R3, [R0, R2, LSL #2]  ; store value(R3) at address R0 + z*4 → a[x][y][z] = value
18

19
    BX      LR                     ; return

The tricks of replacing multiplication with shifts + addition + subtraction that we saw before are also present here. Here we also see a new instruction: RSB (Reverse Subtract). It works exactly like SUB, but it swaps the operands before executing.

Why? In both SUB and RSB, a shift coefficient (such as LSL #4) can be applied only to the second operand. That is no limitation for commutative operations (like addition or multiplication, where you can swap the operands without changing the result). But subtraction is a non-commutative operation, so RSB exists for exactly those cases where the shifted value has to be the one that is subtracted from (as in y*16 − y above).

MIPS

My example is very small, so GCC decided to place the array a inside the 64KiB region reachable by the Global Pointer.

Listing 1.257: Optimizing GCC 4.4.5 (IDA)

1
insert:
2
; $a0 = x
3
; $a1 = y
4
; $a2 = z
5
; $a3 = value
6

7
        sll     $v0, $a0, 5         ; $v0 = $a0 << 5 = x*32
8
        sll     $a0, 3              ; $a0 = $a0 << 3 = x*8
9
        addu    $a0, $v0            ; $a0 = x*8 + x*32 = x*40
10

11
        sll     $v1, $a1, 5         ; $v1 = $a1 << 5 = y*32
12
        sll     $v0, $a0, 4         ; $v0 = $a0 << 4 = x*40*16 = x*640
13
        sll     $a1, 1              ; $a1 = $a1 << 1 = y*2
14

15
        subu    $a1, $v1, $a1       ; $a1 = y*32 - y*2 = y*30
16
        subu    $a0, $v0, $a0       ; $a0 = x*640 - x*40 = x*600
17

18
        la      $gp, __gnu_local_gp ; load global pointer
19

20
        addu    $a0, $a1, $a0       ; $a0 = y*30 + x*600
21
        addu    $a0, $a2            ; $a0 = y*30 + x*600 + z  (final linear index)
22

23
        ; load address of table:
24
        lw      $v0, (a & 0xFFFF)($gp) ; $v0 = base address of array a
25

26
        ; multiply index by 4 to seek array element (each int = 4 bytes):
27
        sll     $a0, 2              ; $a0 = (x*600 + y*30 + z) * 4
28

29
        ; sum up multiplied index and table address:
30
        addu    $a0, $v0, $a0       ; $a0 = base + (x*600 + y*30 + z)*4
31

32
        ; store value into table and return:
33
        sw      $a3, 0($a0)         ; a[x][y][z] = value
34
        jr      $ra                 ; return

Getting dimensions of multidimensional array

A string-processing function that is handed an array of characters cannot deduce the size of the array it received. Likewise, a function that processes a 2D array can deduce only one of its dimensions.

An example of this:

1
int get_element(int array[10][20], int x, int y) { // function receives a 2D array
2
    return array[x][y]; // access element at row x, column y
3
};
4

5
int main() {
6
    int array[10][20]; // declare a 10×20 array
7
    get_element(array, 4, 5); // call with x=4, y=5
8
};

...if compiled (with any compiler) and then decompiled with Hex-Rays:

1
int get_element(int *array, int x, int y) { // Hex-Rays sees only a pointer, not a 2D array
2
    return array[20 * x + y]; // only the second dimension (20) is visible — first dimension is lost
3
}

There is no way to know the size of the first dimension. If the x that was passed is too large, the access goes out of bounds (a buffer over-read) and an element is read from some unrelated location in memory.

And a 3D array:

1
int get_element(int array[10][20][30], int x, int y, int z) { // function receives a 3D array
2
    return array[x][y][z]; // access element at [x][y][z]
3
};
4

5
int main() {
6
    int array[10][20][30]; // declare a 10×20×30 array
7
    get_element(array, 4, 5, 6); // call with x=4, y=5, z=6
8
};

Hex-Rays:

1
int get_element(int *array, int x, int y, int z) { // only a flat int pointer is visible, not a 3D array
2
    return array[600 * x + z + 30 * y]; // 600 = 20*30 and 30 reveal the last two dimensions (20 and 30); the first (10) is lost
3
}

Again, only two of the three dimensions can be determined.

1.26.7 Pack of strings as a two-dimensional array

Let's go back again to the function that returns the month name that we talked about before. As you can see, at least one memory load operation is needed in order to fetch the pointer to the string that is the month name. Is it possible to get rid of that memory load entirely? In fact yes, if we represent the list of strings as a two-dimensional array:

1
#include <stdio.h> // include standard I/O header
2

3
const char month2[12][10]= { // 2D array: 12 months × 10 bytes each (fixed-width storage)
4
    { 'J','a','n','u','a','r','y', 0, 0, 0 },   // "January"   + 3 padding zeros
5
    { 'F','e','b','r','u','a','r','y', 0, 0 },   // "February"  + 2 padding zeros
6
    { 'M','a','r','c','h', 0, 0, 0, 0, 0 },      // "March"     + 5 padding zeros
7
    { 'A','p','r','i','l', 0, 0, 0, 0, 0 },      // "April"     + 5 padding zeros
8
    { 'M','a','y', 0, 0, 0, 0, 0, 0, 0 },        // "May"       + 7 padding zeros
9
    { 'J','u','n','e', 0, 0, 0, 0, 0, 0 },       // "June"      + 6 padding zeros
10
    { 'J','u','l','y', 0, 0, 0, 0, 0, 0 },       // "July"      + 6 padding zeros
11
    { 'A','u','g','u','s','t', 0, 0, 0, 0 },      // "August"    + 4 padding zeros
12
    { 'S','e','p','t','e','m','b','e','r', 0 },   // "September" + 1 padding zero (longest name = 9 chars)
13
    { 'O','c','t','o','b','e','r', 0, 0, 0 },     // "October"   + 3 padding zeros
14
    { 'N','o','v','e','m','b','e','r', 0, 0 },    // "November"  + 2 padding zeros
15
    { 'D','e','c','e','m','b','e','r', 0, 0 }     // "December"  + 2 padding zeros
16
};
17

18
// in 0..11 range
19
const char* get_month2 (int month) { // return pointer to the start of the month name
20
    return &month2[month][0]; // address = base + month*10 (no pointer table needed)
21
};

And here is what happened:

Listing 1.258: Optimizing MSVC 2013 x64

1
month2  DB 04aH, 061H, 06eH, 075H, 061H, 072H, 079H, 00H, 00H, 00H
2
        ; ... (rest of the month data follows)
3

4
get_month2 PROC
5
        movsxd  rax, ecx                    ; sign-extend month (32-bit) into RAX (64-bit)
6
        lea     rcx, QWORD PTR [rax+rax*4]  ; RCX = month + month*4 = month*5
7
        lea     rax, OFFSET FLAT:month2     ; RAX = base address of the 2D table
8
        lea     rax, QWORD PTR [rax+rcx*2]  ; RAX = base + month*5*2 = base + month*10
9
        ret     0
10
get_month2 ENDP

No memory access at all. All the function does is calculate the address of the first character of the month name: pointer_to_the_table + month * 10. Two of the LEA instructions do the arithmetic here (month*5, then base + month*5*2), effectively standing in for several MUL and MOV instructions; the remaining LEA only loads the address of the table. The array width is 10 bytes. Indeed, the longest string here is "September", which is 9 bytes + a null terminator = 10 bytes. The remaining month names are padded with zero bytes so that they all occupy the same space (10 bytes). As a result the function is faster, because the start of each string is at a fixed, easily calculated address.

Optimizing GCC 4.9 made it even shorter:

Listing 1.259: Optimizing GCC 4.9 x64

1
get_month2:
2
        movsx   rdi, edi                ; sign-extend month (32-bit) into RDI (64-bit)
3
        lea     rax, [rdi+rdi*4]        ; RAX = month + month*4 = month*5
4
        lea     rax, month2[rax+rax]    ; RAX = base + month*5*2 = base + month*10
5
        ret

LEA is also used here for multiplication by 10.

Non-optimizing compilers do the multiplication differently.

Listing 1.260: Non-optimizing GCC 4.9 x64

1
get_month2:
2
        push    rbp
3
        mov     rbp, rsp
4
        mov     DWORD PTR [rbp-4], edi      ; spill month argument onto stack
5
        mov     eax, DWORD PTR [rbp-4]      ; reload month from stack
6
        movsx   rdx, eax                    ; RDX = sign-extended month (64-bit)
7
        mov     rax, rdx                    ; RAX = month
8
        sal     rax, 2                      ; RAX = month * 4
9
        add     rax, rdx                    ; RAX = month*4 + month = month*5
10
        add     rax, rax                    ; RAX = month*5 * 2 = month*10
11
        add     rax, OFFSET FLAT:month2     ; RAX = base + month*10  (final address)
12
        pop     rbp
13
        ret

Listing 1.261: Non-optimizing MSVC 2013 x64

1
month$ = 8                                                  ; stack offset of the slot where month is spilled ([rsp+8])
2
get_month2 PROC
3
        mov     DWORD PTR [rsp+8], ecx              ; spill month argument onto stack
4
        movsxd  rax, DWORD PTR month$[rsp]          ; RAX = sign-extended month (64-bit)
5
        imul    rax, rax, 10                        ; RAX = month * 10
6
        lea     rcx, OFFSET FLAT:month2             ; RCX = base address of the table
7
        add     rcx, rax                            ; RCX = base + month*10  (final address)
8
        mov     rax, rcx                            ; RAX = result
9
        mov     ecx, 1                              ; RCX = 1  (presumably the element size, sizeof(char) — NOTE(review): confirm)
10
        imul    rcx, rcx, 0                         ; RCX = 1*0 = 0  (strange! this does nothing useful; presumably the constant [0] subscript)
11
        add     rax, rcx                            ; RAX = result + 0  (adding zero, no effect)
12
        ret     0                                   ; return; RAX = base + month*10
13
get_month2 ENDP

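There is something strange here: why does it multiply by zero and then add zero at the end? This looks like a quirk of the compiler's code generator that was not caught in testing (the code works correctly in the end). A plausible explanation is that the non-optimizing compiler literally computes the [0] subscript of &month2[month][0] (element size 1 times index 0) instead of folding it away. We are looking at this so you understand that sometimes you should not overthink these strange compiler quirks.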

32-bit ARM

Optimizing Keil for Thumb mode uses the MULS instruction:

Listing 1.262: Optimizing Keil 6/2013 (Thumb mode)

1
; R0 = month
2
        MOVS    r1, #0xa            ; R1 = 10  (the array row width)
3
        MULS    r0, r1, r0          ; R0 = 10 * month  (byte offset into the table)
4
        LDR     r1, |L0.68|         ; R1 = base address of the table
5
        ADDS    r0, r0, r1          ; R0 = base + month*10  (final address of month string)
6
        BX      lr                  ; return

Optimizing Keil for ARM mode uses add + shift:

Listing 1.263: Optimizing Keil 6/2013 (ARM mode)

1
; R0 = month
2
        LDR     r1, |L0.104|            ; R1 = base address of the table
3
        ADD     r0, r0, r0, LSL #2      ; R0 = month + month*4 = month*5
4
        ADD     r0, r1, r0, LSL #1      ; R0 = base + month*5*2 = base + month*10
5
        BX      lr                      ; return

ARM64

Listing 1.264: Optimizing GCC 4.9 ARM64

1
; W0 = month
2
        sxtw    x0, w0                  ; X0 = sign-extend month (32-bit) to 64-bit
3
        adrp    x1, .LANCHOR1
4
        add     x1, x1, :lo12:.LANCHOR1 ; X1 = base address of the table (ADRP/ADD pair)
5
        add     x0, x0, x0, lsl 2       ; X0 = month + month*4 = month*5
6
        add     x0, x1, x0, lsl 1       ; X0 = base + month*5*2 = base + month*10
7
        ret                             ; return

MIPS

Listing 1.265: Optimizing GCC 4.4.5 (IDA)

1
.globl get_month2
2
get_month2:
3
; $a0 = month
4
        sll     $v0, $a0, 3         ; $v0 = month << 3 = month*8
5
        sll     $a0, 1              ; $a0 = month << 1 = month*2
6
        addu    $a0, $v0            ; $a0 = month*2 + month*8 = month*10  (byte offset)
7

8
        la      $v0, month2         ; $v0 = base address of the table
9
        addu    $v0, $a0            ; $v0 = base + month*10  (final address of month string)
10
        jr      $ra                 ; return

In short, this is a somewhat old technique for storing text. You will find it a lot in Oracle RDBMS for example. It is hard to say today whether it is worth doing on modern computers. But it is a very good example of arrays, which is why it was added to the book.

1.26.8 Conclusion

An array is a pack of values in memory sitting next to each other. This is true for any element type, even if those elements are structures. Accessing a specific element in an array is nothing more than calculating its address. That means a pointer to an array and the address of its first element are the same thing. That is why ptr[0] and *ptr are equivalent in C/C++.

An interesting thing: Hex-Rays often replaces the first form (ptr[0]) with the second (*ptr). It does so when it does not know that it is dealing with a pointer to a whole array and thinks it is dealing with a pointer to a single variable.

0xV3n0m

1.26.5 Array of pointers to strings

x64

Optimizing GCC 4.9 x64

32-bit MSVC

32-bit ARM

ARM in ARM mode

ARM in Thumb mode

ARM64

MIPS

Array overflow

Array overflow protection

Accessing specific character

1.26.6 Multidimensional arrays

Two-dimensional array example

Row filling example

Column filling example

Access two-dimensional array as one-dimensional

Three-dimensional array example

x86

ARM + Non-optimizing Xcode 4.6.3 (LLVM) (Thumb mode)

ARM + Optimizing Xcode 4.6.3 (LLVM) (Thumb mode)

MIPS

Getting dimensions of multidimensional array

1.26.7 Pack of strings as a two-dimensional array

32-bit ARM

ARM64

MIPS

1.26.8 Conclusion

Table of Contents