关于double和float类型的定义,网上有很多相关的资料,这里仅贴出链接:https://docs.oracle.com/cd/E19957-01/806-3568/ncg_math.html
IEEE754浮点数标准及浮点型和整型之间的转换_a3192048的博客-****博客_ieee754浮点数转换/a3192048/article/details/106662693?spm=1001.2101.3001.6661.1&utm_medium=distribute.pc_relevant_t0.none-task-blog-2%7Edefault%7EOPENSEARCH%7Edefault-1.no_search_link&depth_1-utm_source=distribute.pc_relevant_t0.none-task-blog-2%7Edefault%7EOPENSEARCH%7Edefault-1.no_search_link
浮点数由三部分组成:符号位、指数位、尾数位。
浮点数的特殊数需要单独考虑
nan(指数位均为1,尾数位不全为0)
inf(指数位均为1,尾数位全为0)
denormal(指数位全为0,尾数位不全为0)
round to nearest even(为了防止每次舍入造成数据分布的整体偏移,IEEE设计了round to nearest even舍入模式,根据被舍弃的部分判断保留的最后一位是否进位1。进位有两种情况:1、舍弃部分大于二进制的1000...(即大于最后一位的一半);2、舍弃部分恰好等于二进制的1000...(恰好一半),并且保留的最后一位为1)
下面开始计算流程:
-
符号位,右移63位再左移31位到float的符号位
-
dst += ((data & 0x8000 0000 0000 0000) >> 63) << 31
-
-
指数位,
exp_f = exp_d - 1023 + 127
-
exp_d = data & 0x7ff0 0000 0000 0000 >> 52
,取得指数位 -
exp_d in [874, 896]
的部分需要记录,denormal_shift = min(max(-exp_f + 1, 0), 23)
,后面用来操作非规格数。 -
初始化一个
tail_mask
用来生成尾数掩码,tail_mask=0x007fffff
-
其中指数位
exp_f >= 255
为溢出数(float指数位255只能表示inf/nan),溢出到inf,尾数将要写0,生成tail_mask = exp_f >= 255 ? 0 : tail_mask
-
指数位
exp_f < -23
也为溢出数(下溢),溢出到0,尾数将要写0,与2.3的掩码合并:tail_mask = exp_f < -23 ? 0 : tail_mask
-
之后指数位限制在[0, 255]之间,
exp = max(min(exp_f, 255), 0)
-
dst += exp << 23
-
double指数位为0x7ff的有可能为nan,需要结合尾数位来判断
-
-
尾数位
-
data & 0x000f ffff ffff ffff,取得double尾数位
-
如果尾数位不全为0,与2.7结合判别出nan,得到进位位
nan_mask = exp_d == 0x7ff && tail != 0x0 ? 0x400000 : 0
-
尾数位右移
29 + denormal_shift
位之后,与操作2.3,2.4的结果 -
dst += tail
-
-
denormal位
-
dst += (0x00800000 >> denormal_shift) & 0x007fffff
-
-
rn位
-
准备float最后一个尾数位,
last_bit = 0x00000001 & dst
-
准备double截掉的位,
rn_mask = ((0x0000000020000000 << denormal_shift) - 1) & d_a
-
准备double待比较数,
rn_base = (0x0000000010000000 << denormal_shift)
-
进位准则1:
rn_mask > rn_base ? 1 : 0
-
进位准则2:
rn_mask == rn_base && last_bit ? 1 : 0
-
两个进位准则与2.3的做与,判断是否实际进位,tail_mask
-
dst += (judge1 + judge2) & tail_mask
-
-
nan位
-
现在nan溢出到inf了,需要和3.2的nan掩码相加,
dst += nan_mask
-
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <iostream>
#define U64 uint64_t
#define U32 uint32_t
#define GREEN "\033[0;32;32m"
#define RED "\033[0;32;31m"
#define NONE "\033[0m"
//64左移len 位
/// Shifts a 64-bit value left by `len` bits.
///
/// The shift is carried out on the 64-bit operand itself, so every count in
/// [0, 63] is valid (multiplying by `0x1 << len` evaluates the shift in a
/// 32-bit `int` and breaks for len >= 31).
///
/// @param a    value to shift (`uint64_t` is what the file's U64 macro expands to)
/// @param len  number of bit positions; values <= 0 leave `a` unchanged,
///             values >= 64 shift every bit out
/// @return `a << len`, or 0 when len >= 64
uint64_t move_left64(uint64_t a, int len) {
    if (len <= 0) {
        return a;
    }
    if (len >= 64) {
        return 0;  // shifting by the full width or more is undefined in C++
    }
    return a << len;
}
//64右移len 位
/// Shifts a 64-bit value right by `len` bits (logical shift, zero fill).
///
/// The shift is carried out on the 64-bit operand itself, so every count in
/// [0, 63] is valid (dividing by `0x1 << len` evaluates the shift in a
/// 32-bit `int` and breaks for len >= 31).
///
/// @param a    value to shift (`uint64_t` is what the file's U64 macro expands to)
/// @param len  number of bit positions; values <= 0 leave `a` unchanged,
///             values >= 64 shift every bit out
/// @return `a >> len`, or 0 when len >= 64
uint64_t move_right64(uint64_t a, int len) {
    if (len <= 0) {
        return a;
    }
    if (len >= 64) {
        return 0;  // shifting by the full width or more is undefined in C++
    }
    return a >> len;
}
// Returns the raw bit pattern of a double. memcpy is used because reading the
// object through a casted integer pointer violates the strict-aliasing rule.
static uint64_t double_to_bits(double value) {
    uint64_t bits = 0;
    std::memcpy(&bits, &value, sizeof bits);
    return bits;
}

// Returns the raw bit pattern of a float.
static uint32_t float_to_bits(float value) {
    uint32_t bits = 0;
    std::memcpy(&bits, &value, sizeof bits);
    return bits;
}

// Builds a float from a raw 32-bit pattern.
static float bits_to_float(uint32_t bits) {
    float value = 0.0f;
    std::memcpy(&value, &bits, sizeof value);
    return value;
}

/// Converts a double to a float using integer bit manipulation only
/// (round-to-nearest, ties-to-even) and checks the result against the
/// compiler's own `static_cast<float>`.
///
/// The result is accumulated into `dst` field by field: sign, re-biased
/// exponent, the implicit leading one (only when the result is a float
/// subnormal), the truncated mantissa, the rounding increments and finally the
/// quiet-NaN bit. A rounding increment is a plain `+1` on the whole word, so a
/// mantissa overflow carries into the exponent field.
///
/// Prints "pass!" when the computed pattern equals the bit pattern of
/// `static_cast<float>(d_a)`, otherwise dumps every intermediate value followed
/// by "wrong!".
///
/// NOTE(review): every NaN input yields mantissa 0x400000 (payload dropped);
/// whether that matches the compiler's conversion for NaNs with other payloads
/// is platform dependent and not established here.
///
/// @param d_a  value to convert
/// @return the float whose bit pattern was computed by the manual algorithm
float test(double d_a) {
    // ANSI colour escapes for the verdict line (same byte sequences as the
    // file-level GREEN / RED / NONE macros).
    constexpr const char *kGreen = "\033[0;32;32m";
    constexpr const char *kRed = "\033[0;32;31m";
    constexpr const char *kReset = "\033[0m";

    const uint64_t d_bits = double_to_bits(d_a);
    uint32_t dst = 0;

    // --- sign: bit 63 of the double moves to bit 31 of the float ---
    const uint64_t sign_d_a = (d_bits & 0x8000000000000000ULL) >> 63;
    const uint32_t sign_f_a = static_cast<uint32_t>(sign_d_a) << 31;
    dst += sign_f_a;

    // --- exponent: re-bias from 1023 to 127 and clamp to the 8-bit field ---
    const uint64_t exp_d_a = (d_bits & 0x7ff0000000000000ULL) >> 52;
    const int32_t exp_rebiased = static_cast<int32_t>(exp_d_a) - 1023 + 127;
    const uint32_t exp_f_a =
        static_cast<uint32_t>(std::max<int32_t>(std::min<int32_t>(exp_rebiased, 0xff), 0x0)) << 23;
    // Extra right shift applied to the mantissa when the result is a float
    // subnormal (re-biased exponent <= 0); capped at the 23 mantissa bits.
    const int32_t denormal_shift = std::min<int32_t>(std::max<int32_t>(-exp_rebiased + 1, 0x0), 23);
    dst += exp_f_a;

    // Mantissa contributions are suppressed when the value overflows to inf
    // (re-biased exponent >= 255, i.e. biased double exponent >= 1151; this also
    // covers inf/NaN inputs) or lies below the subnormal range handled here.
    const uint32_t tail_mask = (exp_d_a >= 1151 || exp_d_a < 874) ? 0x0u : 0x007fffffu;

    // --- implicit leading one: shifted into the mantissa field for
    //     subnormal results, masked away (bit 23) for normal results ---
    const uint32_t denormal_bit = (0x800000u >> denormal_shift) & tail_mask;
    dst += denormal_bit;

    // --- mantissa: keep the top 23 bits (fewer for subnormal results) ---
    const uint64_t tail_d_a = d_bits & 0x000fffffffffffffULL;
    const uint32_t tail_f_a = static_cast<uint32_t>(tail_d_a >> (29 + denormal_shift)) & tail_mask;
    dst += tail_f_a;

    // --- round to nearest, ties to even ---
    // rn holds the discarded low bits; rn_base is exactly half of one unit in
    // the last kept place.
    const uint64_t rn_mask = (uint64_t{0x20000000} << denormal_shift) - 1;
    const uint64_t rn_base = uint64_t{0x10000000} << denormal_shift;
    const uint64_t rn = tail_d_a & rn_mask;
    // Rule 1: more than half was discarded -> round up.
    const uint32_t rn_up_judge1 = ((rn > rn_base) ? 0x1u : 0x0u) & tail_mask;
    dst += rn_up_judge1;
    // Rule 2: exactly half was discarded -> round up only if the kept LSB is odd.
    // (When rule 1 fired, rn != rn_base, so reading the LSB after that add is safe.)
    const uint32_t last_bit = dst & 0x1u;
    const uint32_t rn_up_judge2 = ((rn == rn_base && last_bit != 0) ? 0x1u : 0x0u) & tail_mask;
    dst += rn_up_judge2;

    // Biased double exponent 873 means 2^-150 <= |value| < 2^-149. Exactly
    // 2^-150 is a tie that rounds to the even neighbour (zero); anything larger
    // is closer to the smallest float subnormal and must round up to it.
    const uint32_t underflow_round_up = (exp_d_a == 873 && tail_d_a != 0) ? 0x1u : 0x0u;
    dst += underflow_round_up;

    // --- NaN: exponent field already saturated; set the quiet bit so the
    //     mantissa is non-zero ---
    const uint32_t nan_mask = (exp_d_a == 0x7ff && tail_d_a != 0) ? 0x400000u : 0x0u;
    dst += nan_mask;

    const float dst_f = bits_to_float(dst);
    const float reference = static_cast<float>(d_a);
    const uint32_t ref_bits = float_to_bits(reference);

    if (ref_bits == dst) {
        std::printf("dst: %x, %f, line: %d\n", dst, dst_f, __LINE__);
        std::printf("ref: %x, %f, line: %d\n", ref_bits, reference, __LINE__);
        std::printf("%spass!%s\n", kGreen, kReset);
    } else {
        // Dump every intermediate so the failing stage can be located.
        std::printf("sign d: %lld\t %llx\t\t\t line: %d\n", static_cast<long long>(sign_d_a),
                    static_cast<unsigned long long>(sign_d_a), __LINE__);
        std::printf("sign f: %d\t %x\t\t\t line: %d\n", static_cast<int>(sign_f_a), sign_f_a, __LINE__);
        std::printf("dst f: %2.3f\t %x\t\t\t line: %d\n", dst_f, dst, __LINE__);
        std::printf("*******\n");
        std::printf("exp d: %lld\t %llx\t\t\t line: %d\n", static_cast<long long>(exp_d_a),
                    static_cast<unsigned long long>(exp_d_a), __LINE__);
        std::printf("exp f: %d\t %x\t\t\t line: %d\n", static_cast<int>(exp_f_a), exp_f_a, __LINE__);
        std::printf("denormal shift:\t %d, line: %d\n", denormal_shift, __LINE__);
        std::printf("dst f: %2.3f\t %x\t\t\t line: %d\n", dst_f, dst, __LINE__);
        std::printf("*******\n");
        std::printf("tail_mask:\t %x, line: %d\n", tail_mask, __LINE__);
        std::printf("denormal bit:\t %x\n", denormal_bit);
        std::printf("dst f: %2.3f\t %x\t\t\t line: %d\n", dst_f, dst, __LINE__);
        std::printf("*******\n");
        std::printf("tail f a: %x\t\t\t\t line: %d\n", tail_f_a, __LINE__);
        std::printf("dst f: %2.3f\t %x\t\t\t line: %d\n", dst_f, dst, __LINE__);
        std::printf("rn_up_judge1: %x\t\t\t\t line: %d\n", rn_up_judge1, __LINE__);
        std::printf("rn_up_judge2: %x\t\t\t\t line: %d\n", rn_up_judge2, __LINE__);
        std::printf("underflow round up: %x\t\t\t line: %d\n", underflow_round_up, __LINE__);
        std::printf("dst f: %2.3f\t %x\t\t\t line: %d\n", dst_f, dst, __LINE__);
        std::printf("nan mask: %d\t %x\t\t\t line: %d\n", static_cast<int>(nan_mask), nan_mask, __LINE__);
        std::printf("dst f: %2.3f\t %x\t\t\t line: %d\n", dst_f, dst, __LINE__);
        std::printf("dst: %x, %f, line: %d\n", dst, dst_f, __LINE__);
        std::printf("ref: %x, %f, line: %d\n", ref_bits, reference, __LINE__);
        std::printf("%swrong!\n%s", kRed, kReset);
    }
    return dst_f;
}
/// Driver: feeds a few hand-picked raw double bit patterns through test().
int main() {
    const uint64_t patterns[] = {
        0x7ff0000000000006ULL,  // exponent field all ones, non-zero mantissa (NaN)
        0x36a7000000000000ULL,  // biased exponent 0x36a (874), mantissa 0x7000000000000
        0x36a8000000000000ULL,  // biased exponent 0x36a (874), mantissa 0x8000000000000
    };
    for (const uint64_t pattern : patterns) {
        // memcpy instead of a pointer cast: reading an integer object through a
        // double* violates the strict-aliasing rule.
        double test_value = 0.0;
        std::memcpy(&test_value, &pattern, sizeof test_value);
        test(test_value);
    }
    return 0;
}
再贴几个有意思的网站:
IEEE-754 Floating Point Converter,直观显示float每一位的
Base Convert: IEEE 754 Floating Point,研究double2float流程的