GCC-3.4.6源代码学习笔记（39）

4.1.4. 根据目标平台调整选项

从c_common_post_options返回，继续process_options。回忆input_filename访问input_location的file域，这个域记录了当前正在编译的文件。

process_options (continue)

4283 input_filename = main_input_filename;

4284

4285 #ifdef OVERRIDE_OPTIONS

4286 /* Some machines may reject certain combinations of options. */

4287 OVERRIDE_OPTIONS;

4288 #endif

如果后端对于与目标平台相关的编译选项有特定的要求，则定义上面4285行的宏OVERRIDE_OPTIONS，提供自己的处理句柄。对于x86目标平台，这个宏被定义为以下的函数。

1050 void

1051 override_options (void) in i386.c

1052 {

1053 int i;

1054 /* Comes from final.c -- no real reason to change it. */

1055 #define MAX_CODE_ALIGN 16

1056

1057 static struct ptt

1058 {

1059 const struct processor_costs *cost; /* Processor costs */

1060 const int target_enable; /* Target flags to enable. */

1061 const int target_disable; /* Target flags to disable. */

1062 const int align_loop; /* Default alignments. */

1063 const int align_loop_max_skip;

1064 const int align_jump;

1065 const int align_jump_max_skip;

1066 const int align_func;

1067 }

1068 const processor_target_table[PROCESSOR_max] =

1069 {

1070 {&i386_cost, 0, 0, 4, 3, 4, 3, 4},

1071 {&i486_cost, 0, 0, 16, 15, 16, 15, 16},

1072 {&pentium_cost, 0, 0, 16, 7, 16, 7, 16},

1073 {&pentiumpro_cost, 0, 0, 16, 15, 16, 7, 16},

1074 {&k6_cost, 0, 0, 32, 7, 32, 7, 32},

1075 {&athlon_cost, 0, 0, 16, 7, 16, 7, 16},

1076 {&pentium4_cost, 0, 0, 0, 0, 0, 0, 0},

1077 {&k8_cost, 0, 0, 16, 7, 16, 7, 16}

1078 };

1079

1080 static const char * const cpu_names[] = TARGET_CPU_DEFAULT_NAMES;

1081 static struct pta

1082 {

1083 const char *const name; /* processor name or nickname. */

1084 const enum processor_type processor;

1085 const enum pta_flags

1086 {

1087 PTA_SSE = 1,

1088 PTA_SSE2 = 2,

1089 PTA_SSE3 = 4,

1090 PTA_MMX = 8,

1091 PTA_PREFETCH_SSE = 16,

1092 PTA_3DNOW = 32,

1093 PTA_3DNOW_A = 64,

1094 PTA_64BIT = 128

1095 } flags;

1096 }

1097 const processor_alias_table[] =

1098 {

1099 {"i386", PROCESSOR_I386, 0},

1100 {"i486", PROCESSOR_I486, 0},

1101 {"i586", PROCESSOR_PENTIUM, 0},

1102 {"pentium", PROCESSOR_PENTIUM, 0},

1103 {"pentium-mmx", PROCESSOR_PENTIUM, PTA_MMX},

1104 {"winchip-c6", PROCESSOR_I486, PTA_MMX},

1105 {"winchip2", PROCESSOR_I486, PTA_MMX | PTA_3DNOW},

1106 {"c3", PROCESSOR_I486, PTA_MMX | PTA_3DNOW},

1107 {"c3-2", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_PREFETCH_SSE | PTA_SSE},

1108 {"i686", PROCESSOR_PENTIUMPRO, 0},

1109 {"pentiumpro", PROCESSOR_PENTIUMPRO, 0},

1110 {"pentium2", PROCESSOR_PENTIUMPRO, PTA_MMX},

1111 {"pentium3", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_PREFETCH_SSE},

1112 {"pentium3m", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_PREFETCH_SSE},

1113 {"pentium-m", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_PREFETCH_SSE | PTA_SSE2},

1114 {"pentium4", PROCESSOR_PENTIUM4, PTA_SSE | PTA_SSE2

1115 | PTA_MMX | PTA_PREFETCH_SSE},

1116 {"pentium4m", PROCESSOR_PENTIUM4, PTA_SSE | PTA_SSE2

1117 | PTA_MMX | PTA_PREFETCH_SSE},

1118 {"prescott", PROCESSOR_PENTIUM4, PTA_SSE | PTA_SSE2 | PTA_SSE3

1119 | PTA_MMX | PTA_PREFETCH_SSE},

1120 {"nocona", PROCESSOR_PENTIUM4, PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_64BIT

1121 | PTA_MMX | PTA_PREFETCH_SSE},

1122 {"k6", PROCESSOR_K6, PTA_MMX},

1123 {"k6-2", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},

1124 {"k6-3", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},

1125 {"athlon", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW

1126 | PTA_3DNOW_A},

1127 {"athlon-tbird", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE

1128 | PTA_3DNOW | PTA_3DNOW_A},

1129 {"athlon-4", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW

1130 | PTA_3DNOW_A | PTA_SSE},

1131 {"athlon-xp", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW

1132 | PTA_3DNOW_A | PTA_SSE},

1133 {"athlon-mp", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW

1134 | PTA_3DNOW_A | PTA_SSE},

1135 {"x86-64", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_64BIT

1136 | PTA_SSE | PTA_SSE2 },

1137 {"k8", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT

1138 | PTA_3DNOW_A | PTA_SSE | PTA_SSE2},

1139 {"opteron", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT

1140 | PTA_3DNOW_A | PTA_SSE | PTA_SSE2},

1141 {"athlon64", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT

1142 | PTA_3DNOW_A | PTA_SSE | PTA_SSE2},

1143 {"athlon-fx", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT

1144 | PTA_3DNOW_A | PTA_SSE | PTA_SSE2},

1145 };

1146

1147 int const pta_size = ARRAY_SIZE (processor_alias_table);

上面，processor_target_table和processor_alias_table的类型定义在它们之前，因此这些类型不能用于别处。在1080行的TARGET_CPU_DEFAULT_NAMES定义了CPU族。

710 #define TARGET_CPU_DEFAULT_NAMES {"i386", "i486", "pentium", "pentium-mmx",\

711 "pentiumpro", "pentium2", "pentium3", \

712 "pentium4", "k6", "k6-2", "k6-3",\

713 "athlon", "athlon-4", "k8", \

714 "pentium-m", "prescott", "nocona"}

在1085行的pta_flags描述了特定芯片上可用的寄存器集的属性。

override_options (continue)

1149 /* Set the default values for switches whose default depends on TARGET_64BIT

1150 in case they weren't overwritten by command line options. */

1151 if (TARGET_64BIT)

1152 {

1153 if (flag_omit_frame_pointer == 2)

1154 flag_omit_frame_pointer = 1;

1155 if (flag_asynchronous_unwind_tables == 2)

1156 flag_asynchronous_unwind_tables = 1;

1157 if (flag_pcc_struct_return == 2)

1158 flag_pcc_struct_return = 0;

1159 }

1160 else

1161 {

1162 if (flag_omit_frame_pointer == 2)

1163 flag_omit_frame_pointer = 0;

1164 if (flag_asynchronous_unwind_tables == 2)

1165 flag_asynchronous_unwind_tables = 0;

1166 if (flag_pcc_struct_return == 2)

1167 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;

1168 }

1169

1170 #ifdef SUBTARGET_OVERRIDE_OPTIONS

1171 SUBTARGET_OVERRIDE_OPTIONS;

1172 #endif

1173

1174 if (!ix86_tune_string && ix86_arch_string)

1175 ix86_tune_string = ix86_arch_string;

1176 if (!ix86_tune_string)

1177 ix86_tune_string = cpu_names [TARGET_CPU_DEFAULT];

1178 if (!ix86_arch_string)

1179 ix86_arch_string = TARGET_64BIT ? "x86-64" : "i386";

1180

1181 if (ix86_cmodel_string != 0)

1182 {

1183 if (!strcmp (ix86_cmodel_string, "small"))

1184 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;

1185 else if (flag_pic)

1186 sorry ("code model %s not supported in PIC mode", ix86_cmodel_string);

1187 else if (!strcmp (ix86_cmodel_string, "32"))

1188 ix86_cmodel = CM_32;

1189 else if (!strcmp (ix86_cmodel_string, "kernel") && ! flag_pic)

1190 ix86_cmodel = CM_KERNEL;

1191 else if (!strcmp (ix86_cmodel_string, "medium") && ! flag_pic)

1192 ix86_cmodel = CM_MEDIUM;

1193 else if (!strcmp (ix86_cmodel_string, "large") && ! flag_pic)

1194 ix86_cmodel = CM_LARGE;

1195 else

1196 error ("bad value (%s) for -mcmodel= switch", ix86_cmodel_string);

1197 }

1198 else

1199 {

1200 ix86_cmodel = CM_32;

1201 if (TARGET_64BIT)

1202 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;

1203 }

1204 if (ix86_asm_string != 0)

1205 {

1206 if (!strcmp (ix86_asm_string, "intel"))

1207 ix86_asm_dialect = ASM_INTEL;

1208 else if (!strcmp (ix86_asm_string, "att"))

1209 ix86_asm_dialect = ASM_ATT;

1210 else

1211 error ("bad value (%s) for -masm= switch", ix86_asm_string);

1212 }

1213 if ((TARGET_64BIT == 0) != (ix86_cmodel == CM_32))

1214 error ("code model `%s' not supported in the %s bit mode",

1215 ix86_cmodel_string, TARGET_64BIT ? "64" : "32");

1216 if (ix86_cmodel == CM_LARGE)

1217 sorry ("code model `large' not supported yet");

1218 if ((TARGET_64BIT != 0) != ((target_flags & MASK_64BIT) != 0))

1219 sorry ("%i-bit mode not compiled in",

1220 (target_flags & MASK_64BIT) ? 64 : 32);

上面1167行，对于x86芯片DEFAULT_PCC_STRUCT_RETURN定义为1。而1177行的TARGET_CPU_DEFAULT将选定默认芯片，对于32位芯片，它的值是0（因而选用最普通的名字“i386”）。另在1170行的SUBTARGET_OVERRIDE_OPTIONS对于x86芯片是没有定义的。

对于那些形如ix86_*_string的变量，我们在set_target_switch中已经看到对它们的赋值。【6】对相关的选项给出如下的描述（针对x86体系）。

-mtune=cpu-type

除了ABI及可用指令集，根据cpu-type，对产生的代码作出所有合适的调整。可选择的cpu-type有：

generic 为最通用的IA32/AMD64/EM64T处理器产生优化代码。如果你知道你的代码所运行的CPU，那么你应该使用相应的-mtune选项而不是-mtune=generic。但是，如果你不确切知道你的应用的用户所用的CPU，那么你应该使用这个选项。

因为新的芯片发布后，这个选项的行为将有改变。因此，如果你升级到新版本的GCC，代码生成选项将改变以反映该版本GCC发布时最通用的处理器。

没有选项-march=generic，因为-march表示编译器可使用的指令集，而没有一个通用的指令集可以用于所有的处理器。相对的，-mtune表示代码为该处理器（或者，在我们这种情况下，一组处理器）所优化。

native 通过在编译时刻确定编译机器的处理器类型，选择为之调整代码的CPU。使用-mtune=native将在所选指令集的限制下，产生为本地机器优化的代码。使用-march=native将使能本地机器所支持的所有指令子集（因此编译结果可能不能运行在别的机器上）。

i386 原始的 Intel i386 CPU.

i486 Intel i486 CPU。（该芯片没有实现调度（scheduling））

i586，pentium

没有MMX的Intel Pentium CPU。

pentium-mmx

基于Pentium 核心，支持MMX指令集的Intel PentiumMMX CPU。

pentiumpro

Intel PentiumPro CPU。

i686 与generic同义，但当用作-march选项时，使用PentiumPro指令集，因此代码将能运行在所有i686芯片族上。

pentium2 基于PentiumPro 核心，支持MMX指令集的Intel Pentium2 CPU。

pentium3，pentium3m

基于PentiumPro 核心，支持MMX和SSE 指令集的Intel Pentium3 CPU。

pentium-m

支持MMX，SSE和SSE2 指令集的低功耗的Intel Pentium3 CPU。

pentium4，pentium4m

支持MMX，SSE和SSE2 指令集的Intel Pentium4 CPU。

prescott 支持MMX，SSE，SSE2和SSE3 指令集的改进Intel Pentium4 CPU。

nocona 支持MMX，SSE，SSE2和SSE3 指令集及64位扩展的改进Intel Pentium4 CPU。

core2 支持MMX，SSE，SSE2和SSE3 指令集及64位扩展的Intel Core2 CPU。

k6 支持MMX指令集的AMD K6 CPU。

k6-2，k6-3 支持MMX和3dNOW!指令集的改进AMD K6 CPU。

athlon，athlon-tbird

支持MMX，3dNOW!，增强3dNOW!及SSE预取指令集的AMD Athlon CPU。

athlon-4，athlon-xp，athlon-mp

支持MMX，3dNOW!，增强3dNOW!及SSE完整指令集的改进AMD Athlon CPU

k8，opteron，athlon64，athlon-fx

基于AMD K8核心，支持x86-64指令集的CPU（这是MMX，SSE，SSE2，3dNOW!，增强3dNOW!和64-bit指令集扩展的超集）。

k8-sse3，opteron-sse3，athlon64-sse3

支持SSE3指令集的改进k8，opteron和athlon64。

amdfam10，barcelona

基于AMD Family 10h核心，支持x86-64指令集的CPU（这是MMX，SSE，SSE2，3dNOW!，增强3dNOW!和64-bit指令集扩展的超集）。

winchip-c6

IDT Winchip C6 CPU，与支持MMX指令集的i486处理相同。

winchip2 IDT Winchip2 CPU，与支持MMX和3dNOW!指令集的i486处理相同。

c3 支持MMX和3dNOW!指令集的C3 CPU（该芯片没有实现调度）。

c3-2 支持MMX和SSE指令集的C3-2 CPU（该芯片没有实现调度）。

geode 支持MMX和3dNOW!指令集的嵌入式AMD CPU。

选定特定的cpu类型将作出对该芯片合适的调度安排，但除非使用了-march=cpu-type选项，编译器不会产生不能在i386上运行的代码。

-march=cpu-type

为机器类型为cpu-type的机器产生代码。Cpu类型的选择与-mtune选项相同。更进一步，指定-march=cpu-type隐含-mtune=cpu-type。

-mcpu=cpu-type

已过时的-mtune的同义词。

-masm=dialect

使用选定的方言输出汇编指令。支持的选择有intel或att（默认值）。Darwin不支持intel。

除了上面所说的，在64位环境中，AMD x86-64 处理器还支持以下-m选项。

-m32 -m64

为32位或64位环境产生代码。32位环境设置int，long及指针为32位大小，并产生能在任意i386系统上运行的代码。64位环境设置int为32位大小，而long和指针为64位大小，并为AMD’s x86-64架构产生代码。对于darwin仅-m64选项会关闭-fno-pic及-mdynamic-no-pic选项。

-mno-red-zone

对于x86-64的代码，不要使用所谓的红区（red zone）。红区是x86-64 ABI的要求，它是一个在栈指针位置以外，不会被信号及异常句柄所修改的128字节大小的区域，因此它可被用于保存临时变量而不需调整栈指针。设置标识符-mno-red-zone，则禁用红区。

-mcmodel=small

为小代码模式产生代码：程序及其符号必须被链接入地址空间的低2 GB部分。指针是64 位大小。程序可以被静态或动态链接。这是默认的代码模式（the default code model）。

-mcmodel=kernel

为内核代码模式产生代码。内核运行在地址空间的高2 GB部分。这个模式为Linux内核代码使用。

-mcmodel=medium

为中等模式产生代码：程序被链接入地址空间的低2 GB，但符号可以位于地址空间的任一处。程序可以被静态或动态链接，但中等模式不支持创建共享库。

-mcmodel=large

为大模式产生代码：这个模式不对段的大小及地址做任何假设。

上面的段落解释了下面cmodel的含义。

108 enum cmodel { in i386.h

109 CM_32, /* The traditional 32-bit ABI. */

110 CM_SMALL, /* Assumes all code and data fits in the low 31 bits. */

111 CM_KERNEL, /* Assumes all code and data fits in the high 31 bits. */

112 CM_MEDIUM, /* Assumes code fits in the low 31 bits; data unlimited. */

113 CM_LARGE, /* No assumptions. */

114 CM_SMALL_PIC /* Assumes code+data+got/plt fits in a 31 bit region. */

115 };

override_options (continue)

1222 for (i = 0; i < pta_size; i++)

1223 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))

1224 {

1225 ix86_arch = processor_alias_table[i].processor;

1226 /* Default cpu tuning to the architecture. */

1227 ix86_tune = ix86_arch;

1228 if (processor_alias_table[i].flags & PTA_MMX

1229 && !(target_flags_explicit & MASK_MMX))

1230 target_flags |= MASK_MMX;

1231 if (processor_alias_table[i].flags & PTA_3DNOW

1232 && !(target_flags_explicit & MASK_3DNOW))

1233 target_flags |= MASK_3DNOW;

1234 if (processor_alias_table[i].flags & PTA_3DNOW_A

1235 && !(target_flags_explicit & MASK_3DNOW_A))

1236 target_flags |= MASK_3DNOW_A;

1237 if (processor_alias_table[i].flags & PTA_SSE

1238 && !(target_flags_explicit & MASK_SSE))

1239 target_flags |= MASK_SSE;

1240 if (processor_alias_table[i].flags & PTA_SSE2

1241 && !(target_flags_explicit & MASK_SSE2))

1242 target_flags |= MASK_SSE2;

1243 if (processor_alias_table[i].flags & PTA_SSE3

1244 && !(target_flags_explicit & MASK_SSE3))

1245 target_flags |= MASK_SSE3;

1246 if (processor_alias_table[i].flags & PTA_PREFETCH_SSE)

1247 x86_prefetch_sse = true;

1248 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))

1249 error ("CPU you selected does not support x86-64 instruction set");

1250 break;

1251 }

1252

1253 if (i == pta_size)

1254 error ("bad value (%s) for -march= switch", ix86_arch_string);

1255

1256 for (i = 0; i < pta_size; i++)

1257 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))

1258 {

1259 ix86_tune = processor_alias_table[i].processor;

1260 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))

1261 error ("CPU you selected does not support x86-64 instruction set");

1262

1263 /* Intel CPUs have always interpreted SSE prefetch instructions as

1264 NOPs; so, we can enable SSE prefetch instructions even when

1265 -mtune (rather than -march) points us to a processor that has them.

1266 However, the VIA C3 gives a SIGILL, so we only do that for i686 and

1267 higher processors. */

1268 if (TARGET_CMOVE && (processor_alias_table[i].flags & PTA_PREFETCH_SSE))

1269 x86_prefetch_sse = true;

1270 break;

1271 }

1272 if (i == pta_size)

1273 error ("bad value (%s) for -mtune= switch", ix86_tune_string);

上面target_flags_explicit也是在set_target_switch中设置的。它记录了特定的选项是否已由用户在命令行上显式指定（打开或关闭）。因此，对于用户没有显式指定的选项，编译器可以根据processor_alias_table的知识进行自动补齐。

override_options (continue)

1275 if (optimize_size)

1276 ix86_cost = &size_cost;

1277 else

1278 ix86_cost = processor_target_table[ix86_tune].cost;

1279 target_flags |= processor_target_table[ix86_tune].target_enable;

1280 target_flags &= ~processor_target_table[ix86_tune].target_disable;

1281

1282 /* Arrange to set up i386_stack_locals for all functions. */

1283 init_machine_status = ix86_init_machine_status;

1284

1285 /* Validate -mregparm= value. */

1286 if (ix86_regparm_string)

1287 {

1288 i = atoi (ix86_regparm_string);

1289 if (i < 0 || i > REGPARM_MAX)

1290 error ("-mregparm=%d is not between 0 and %d", i, REGPARM_MAX);

1291 else

1292 ix86_regparm = i;

1293 }

1294 else

1295 if (TARGET_64BIT)

1296 ix86_regparm = REGPARM_MAX;

1297

1298 /* If the user has provided any of the -malign-* options,

1299 warn and use that value only if -falign-* is not set.

1300 Remove this code in GCC 3.2 or later. */

1301 if (ix86_align_loops_string)

1302 {

1303 warning ("-malign-loops is obsolete, use -falign-loops");

1304 if (align_loops == 0)

1305 {

1306 i = atoi (ix86_align_loops_string);

1307 if (i < 0 || i > MAX_CODE_ALIGN)

1308 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);

1309 else

1310 align_loops = 1 << i;

1311 }

1312 }

1313

1314 if (ix86_align_jumps_string)

1315 {

1316 warning ("-malign-jumps is obsolete, use -falign-jumps");

1317 if (align_jumps == 0)

1318 {

1319 i = atoi (ix86_align_jumps_string);

1320 if (i < 0 || i > MAX_CODE_ALIGN)

1321 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);

1322 else

1323 align_jumps = 1 << i;

1324 }

1325 }

1326

1327 if (ix86_align_funcs_string)

1328 {

1329 warning ("-malign-functions is obsolete, use -falign-functions");

1330 if (align_functions == 0)

1331 {

1332 i = atoi (ix86_align_funcs_string);

1333 if (i < 0 || i > MAX_CODE_ALIGN)

1334 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);

1335 else

1336 align_functions = 1 << i;

1337 }

1338 }

1339

1340 /* Default align_* from the processor table. */

1341 if (align_loops == 0)

1342 {

1343 align_loops = processor_target_table[ix86_tune].align_loop;

1344 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;

1345 }

1346 if (align_jumps == 0)

1347 {

1348 align_jumps = processor_target_table[ix86_tune].align_jump;

1349 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;

1350 }

1351 if (align_functions == 0)

1352 {

1353 align_functions = processor_target_table[ix86_tune].align_func;

1354 }

1355

1356 /* Validate -mpreferred-stack-boundary= value, or provide default.

1357 The default of 128 bits is for Pentium III's SSE __m128, but we

1358 don't want additional code to keep the stack aligned when

1359 optimizing for code size. */

1360 ix86_preferred_stack_boundary = (optimize_size

1361 ? TARGET_64BIT ? 128 : 32

1362 : 128);

1363 if (ix86_preferred_stack_boundary_string)

1364 {

1365 i = atoi (ix86_preferred_stack_boundary_string);

1366 if (i < (TARGET_64BIT ? 4 : 2) || i > 12)

1367 error ("-mpreferred-stack-boundary=%d is not between %d and 12", i,

1368 TARGET_64BIT ? 4 : 2);

1369 else

1370 ix86_preferred_stack_boundary = (1 << i) * BITS_PER_UNIT;

1371 }

1372

1373 /* Validate -mbranch-cost= value, or provide default. */

1374 ix86_branch_cost = processor_target_table[ix86_tune].cost->branch_cost;

1375 if (ix86_branch_cost_string)

1376 {

1377 i = atoi (ix86_branch_cost_string);

1378 if (i < 0 || i > 5)

1379 error ("-mbranch-cost=%d is not between 0 and 5", i);

1380 else

1381 ix86_branch_cost = i;

1382 }

1383

1384 if (ix86_tls_dialect_string)

1385 {

1386 if (strcmp (ix86_tls_dialect_string, "gnu") == 0)

1387 ix86_tls_dialect = TLS_DIALECT_GNU;

1388 else if (strcmp (ix86_tls_dialect_string, "sun") == 0)

1389 ix86_tls_dialect = TLS_DIALECT_SUN;

1390 else

1391 error ("bad value (%s) for -mtls-dialect= switch",

1392 ix86_tls_dialect_string);

1393 }

对于Intel x86，在上面代码中所引用的目标选项中，那些在intel芯片中可用的由【6】给出如下。上面，REGPARM_MAX对于32位芯片是3，而MAX_CODE_ALIGN在函数的开头是16。

-mregparm=num

控制多少个寄存器用于传递整型参数。默认情况下，没有寄存器用于传递参数，并且最多可以使用3个寄存器。可以通过使用函数属性regparm来控制特定函数的这个行为。

警告：如果使用这个选项，并且num非0，那么使用同样的值来编译所有的模块，包括所有的库。这里面包括系统库及启动模块。

-mpreferred-stack-boundary=num

尝试保持栈对齐在2的num次方字节的边界。如果没有指明-mpreferred-stack-boundary，默认值是4（16字节或128比特）。

在Pentium及PentiumPro上，double和long double类型的值应该对齐在8字节的边界（参考-malign-double）或者忍受显著的性能下降。在Pentium III上， SIMD扩展(SSE) 流数据类型__m128如果不在16字节上对齐，可能不能正常工作。

为了确保在栈中这个值的正确对齐，栈的边界必须满足栈中保存的任意值的对齐要求。更甚，每个产生的函数都需要保证栈被对齐。因此，由一个使用较小栈边界编译的函数调用一个使用较大栈边界编译的函数，很可能会导致栈失调（misalign）。建议使用回调的库应一直使用默认设置。

这个额外的对齐会消耗额外的栈空间，并且通常会增加代码大小。对栈空间的使用敏感的代码，比如嵌入式系统及操作系统内核，可能希望减少期望的对齐到-mpreferred-stack-boundary=2。

override_options (continue)

1395 /* Keep nonleaf frame pointers. */

1396 if (TARGET_OMIT_LEAF_FRAME_POINTER)

1397 flag_omit_frame_pointer = 1;

1398

1399 /* If we're doing fast math, we don't care about comparison order

1400 wrt NaNs. This lets us use a shorter comparison sequence. */

1401 if (flag_unsafe_math_optimizations)

1402 target_flags &= ~MASK_IEEE_FP;

1403

1404 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,

1405 since the insns won't need emulation. */

1406 if (x86_arch_always_fancy_math_387 & (1 << ix86_arch))

1407 target_flags &= ~MASK_NO_FANCY_MATH_387;

1408

1409 /* Turn on SSE2 builtins for -msse3. */

1410 if (TARGET_SSE3)

1411 target_flags |= MASK_SSE2;

1412

1413 /* Turn on SSE builtins for -msse2. */

1414 if (TARGET_SSE2)

1415 target_flags |= MASK_SSE;

1416

1417 if (TARGET_64BIT)

1418 {

1419 if (TARGET_ALIGN_DOUBLE)

1420 error ("-malign-double makes no sense in the 64bit mode");

1421 if (TARGET_RTD)

1422 error ("-mrtd calling convention not supported in the 64bit mode");

1423 /* Enable by default the SSE and MMX builtins. */

1424 target_flags |= (MASK_SSE2 | MASK_SSE | MASK_MMX | MASK_128BIT_LONG_DOUBLE);

1425 ix86_fpmath = FPMATH_SSE;

1426 }

1427 else

1428 {

1429 ix86_fpmath = FPMATH_387;

1430 /* i386 ABI does not specify red zone. It still makes sense to use it

1431 when programmer takes care to stack from being destroyed. */

1432 if (!(target_flags_explicit & MASK_NO_RED_ZONE))

1433 target_flags |= MASK_NO_RED_ZONE;

1434 }

1435

1436 if (ix86_fpmath_string != 0)

1437 {

1438 if (! strcmp (ix86_fpmath_string, "387"))

1439 ix86_fpmath = FPMATH_387;

1440 else if (! strcmp (ix86_fpmath_string, "sse"))

1441 {

1442 if (!TARGET_SSE)

1443 {

1444 warning ("SSE instruction set disabled, using 387 arithmetics");

1445 ix86_fpmath = FPMATH_387;

1446 }

1447 else

1448 ix86_fpmath = FPMATH_SSE;

1449 }

1450 else if (! strcmp (ix86_fpmath_string, "387,sse")

1451 || ! strcmp (ix86_fpmath_string, "sse,387"))

1452 {

1453 if (!TARGET_SSE)

1454 {

1455 warning ("SSE instruction set disabled, using 387 arithmetics");

1456 ix86_fpmath = FPMATH_387;

1457 }

1458 else if (!TARGET_80387)

1459 {

1460 warning ("387 instruction set disabled, using SSE arithmetics");

1461 ix86_fpmath = FPMATH_SSE;

1462 }

1463 else

1464 ix86_fpmath = FPMATH_SSE | FPMATH_387;

1465 }

1466 else

1467 error ("bad value (%s) for -mfpmath= switch", ix86_fpmath_string);

1468 }

上面的ix86_fpmath_string用于下列选项【6】。

-mfpmath=unit

为选定单元产生浮点算术。单元的选择有：

387 使用在大多数芯片及模拟器都存在的标准387浮点协处理器。用此选项编译的代码几乎可以运行在任何地方。临时结果以80位精度计算，而不是以类型所指定的精度。与其他大部分芯片比较，这将导致微小的差异。更多细节，参考-ffloat-store。对于i386编译器，这是默认的选择。

sse 使用SSE指令集中的纯量浮点指令（scalar floating point instruction）。这个指令集为Pentium3及更新的芯片支持，在AMD系列，则为Athlon-4，Athlon-xp及Athlon-mp 支持。更早版本的SSE指令集仅支持单精度算术，因此双精度及扩展精度仍需使用387。更新的版本，仅出现在Pentium4及未来的AMD x86-64芯片，支持双精度算术。

对于i386编译器，需要使用-march=cpu-type，-msse或-msse2选项来启动SSE 扩展以使得该选项生效。对于x86-64编译器，这些扩展默认就是生效的。

在大多数情况下，生成的代码要快得多，并且避免了387代码数值不稳定的问题，但可能会破坏一些期望临时结果为80位的现存代码。这是x86-64编译器的默认的选择。

sse，387 尝试同时应用两者指令集。这有效地加倍了可用的寄存器，而且在具有分立的387及SSE执行单元的芯片上，亦增加了执行资源。使用该选择需小心，因为它仍在试验中，GCC寄存器分配器不能很好地为分立功能单元建模，也导致性能不稳定。

override_options (continue)

1470 /* It makes no sense to ask for just SSE builtins, so MMX is also turned

1471 on by -msse. */

1472 if (TARGET_SSE)

1473 {

1474 target_flags |= MASK_MMX;

1475 x86_prefetch_sse = true;

1476 }

1477

1478 /* If it has 3DNow! it also has MMX so MMX is also turned on by -m3dnow */

1479 if (TARGET_3DNOW)

1480 {

1481 target_flags |= MASK_MMX;

1482 /* If we are targeting the Athlon architecture, enable the 3Dnow/MMX

1483 extensions it adds. */

1484 if (x86_3dnow_a & (1 << ix86_arch))

1485 target_flags |= MASK_3DNOW_A;

1486 }

1487 if ((x86_accumulate_outgoing_args & TUNEMASK)

1488 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)

1489 && !optimize_size)

1490 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;

1491

1492 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */

1493 {

1494 char *p;

1495 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);

1496 p = strchr (internal_label_prefix, 'X');

1497 internal_label_prefix_len = p - internal_label_prefix;

1498 *p = '\0';

1499 }

1500 }

在上面的1487行，x86_accumulate_outgoing_args定义如下：

507 const int x86_accumulate_outgoing_args = m_ATHLON_K8 | m_PENT4 | m_PPRO; in i386.c

其中，例如，m_ATHLON_K8具有以下定义。

470 #define m_K8 (1<<PROCESSOR_K8) in i386.c

471 #define m_ATHLON_K8 (m_K8 | m_ATHLON)

PROCESSOR_K8是枚举类型processor_type的其中一个值。显然，形如x86_accumulate_outgoing_args的变量，以位掩码的形式定义了具有指定特性的芯片集合。

在1495行，在Linux上，ASM_GENERATE_INTERNAL_LABEL被定义为：

213 #undef ASM_GENERATE_INTERNAL_LABEL in linux.h

214 #define ASM_GENERATE_INTERNAL_LABEL(LABEL,PREFIX,NUM) \

215 sprintf (LABEL, "*.L%s%ld", PREFIX, (long)(NUM))

因此sprintf首先在internal_label_prefix中生成“*.LLX0”；1496行的strchr找到其中的‘X’，1497行把internal_label_prefix_len设为4，而在1498行‘\0’将替换‘X’，于是internal_label_prefix最终是“*.LL”。

秒客网

GCC-3.4.6源代码学习笔记（39）

4.1.4. 根据目标平台调整选项

相关文章