Compile the following program with optimization enabled, targeting x64, using the cl.exe compiler.
#include <stdio.h>
/* Minimal two-field record used by the reproduction case. */
typedef struct {
/* Bit field word; main() tests it against the mask 0x200000. */
int flags;
/* Value that main() passes to _byteswap_ushort(). */
int n;
} bug_t;
/* File-scope instance. Nothing in this program assigns to it, so both
   fields hold their zero initial value when main() reads them. */
bug_t _bug;
/* Hand-written declaration of the compiler's 16-bit byte-swap routine, so
   that <intrin.h> does not have to be included. */
unsigned short _byteswap_ushort(unsigned short val); /* from <intrin.h> */
/* Request the intrinsic (inline) form of _byteswap_ushort; in the generated
   listing the call appears as a single "ror ax, 8" instruction. */
#pragma intrinsic(_byteswap_ushort)
/*
 * Reproduction case for the suspected miscompilation.
 *
 * Reads the global _bug through a volatile pointer, tests bit 0x200000 of
 * flags, and prints "bad" or "good" followed by the byte-swapped low 16 bits
 * of n. Because _bug is never written, the expected output is the "good"
 * line. argc and argv are unused. Always returns 0.
 */
int main(int argc, char *argv[])
{
/* Access through a volatile-qualified pointer; presumably so the optimizer
   cannot fold the reads of the never-written global into constants. */
volatile bug_t *bug = (volatile bug_t *)&_bug;
/* 0x200000 is bit 21 of flags. */
if (bug->flags & 0x200000)
/* bug->n (int) is converted to unsigned short for the intrinsic, and the
   unsigned short result is promoted to int for %d. */
printf("bad %d\n", _byteswap_ushort(bug->n));
else
printf("good %d\n", _byteswap_ushort(bug->n));
/* Wait for a character on stdin before exiting; presumably to keep the
   console window open so the output can be read. */
getchar();
return 0;
}
Although bug->flags is 0, the program executes the true branch of the if statement and prints "bad". The following is the assembly output:
; Listing generated by Microsoft (R) Optimizing Compiler Version 16.00.30319.01
include listing.inc
INCLUDELIB OLDNAMES
EXTRN __security_check_cookie:PROC
EXTRN __imp_getchar:PROC
EXTRN __imp_printf:PROC
COMM _bug:QWORD
$SG-7 DB 'bad %d', 0aH, 00H
$SG-8 DB 'good %d', 0aH, 00H
PUBLIC main
; COMDAT pdata
; File c:\users\steve\documents\visual studio 2010\projects\test2\test2\test2.c
pdata SEGMENT
$pdata$main DD imagerel $LN6
DD imagerel $LN6+62
DD imagerel $unwind$main
pdata ENDS
; COMDAT xdata
xdata SEGMENT
$unwind$main DD 010401H
DD 04204H
; Function compile flags: /Ogtpy
xdata ENDS
; COMDAT main
_TEXT SEGMENT
argc$ = 48
argv$ = 56
; Optimized code generated by the compiler for main(). Lines prefixed with
; "NOTE(review)" are reviewer annotations, not compiler output.
main PROC ; COMDAT
; 13 : {
$LN6:
sub rsp, 40 ; 00000028H
; 14 : volatile bug_t *bug = (volatile bug_t *)&_bug;
; 15 : if (bug->flags & 0x200000)
; NOTE(review): loads bug->flags (offset 0 of _bug) into eax.
mov eax, DWORD PTR _bug
; 16 : printf("bad %d\n", _byteswap_ushort(bug->n));
; NOTE(review): rcx is preloaded with the "bad" format string ($SG-7); it is
; only replaced by the "good" string if the branch below falls through.
lea rcx, OFFSET FLAT:$SG-7
; NOTE(review): tests bit 21 (mask 0x200000); bt reports the tested bit in
; the carry flag.
bt eax, 21
; NOTE(review): loads bug->n (offset 4 of _bug) and byte-swaps its low 16 bits
; into edx. The ror is scheduled between the bt and the branch and may itself
; modify the carry flag.
mov eax, DWORD PTR _bug+4
ror ax, 8
movzx edx, ax
; NOTE(review): jne branches on the zero flag, not on the carry flag produced
; by bt -- this is the suspected miscompilation. When taken, the "good"
; format string is never loaded.
jne SHORT $LN5@main
; 17 : else
; 18 : printf("good %d\n", _byteswap_ushort(bug->n));
lea rcx, OFFSET FLAT:$SG-8
$LN5@main:
; NOTE(review): single shared printf call for both branches; only the format
; string in rcx differs.
call QWORD PTR __imp_printf
; 19 : getchar();
call QWORD PTR __imp_getchar
; 20 : return 0;
xor eax, eax
; 21 : }
add rsp, 40 ; 00000028H
ret 0
main ENDP
_TEXT ENDS
END
There are two weird things in the assembly output, which might be connected to the same problem:
1) "bt eax, 21" sets the carry flag when bit 21 is set, but the branch instruction (jne) tests the zero flag instead.
2) The _byteswap_ushort call is compiled to a ror instruction. That is fine in itself, but the ror is placed between "bt eax, 21" and its branch (jne), and it changes the carry flag that bt set, which must not be touched until the branch.
This is very serious, since it means we cannot trust the optimizer or the byteswap intrinsic functions.