# Blog: Playing with the CPU pipeline: poly.cpp

1 | #include <chrono> |

2 | #include <cmath> |

3 | #include <iostream> |

4 | #include <iomanip> |

5 | #include <cassert> |

6 | |

7 | using namespace std; |

8 | |

// Forward declarations of the seven polynomial evaluation variants.
// Every variant takes and returns a double, and carries the GCC/Clang
// `noinline` attribute so the compiler will not inline it into its caller.
__attribute__((noinline)) double sin1(double x);
__attribute__((noinline)) double sin2(double x);
__attribute__((noinline)) double sin3(double x);
__attribute__((noinline)) double sin4(double x);
__attribute__((noinline)) double sin5(double x);
__attribute__((noinline)) double sin6(double x);
__attribute__((noinline)) double sin7(double x);

16 | |

// Polynomial coefficients shared by sin1() .. sin7().
// a<k> is the factor applied to x^(2k+1): a0 goes with x, a1 with x^3, and so
// on up to a7 with x^15. The signs alternate, and the values are close to,
// but not equal to, the Taylor-series terms (-1)^k / (2k+1)!
// (presumably a fitted approximation; its origin is not shown in this file).
static const double a0 = +1.0;
static const double a1 = -1.666666666666580809419428987894207e-1;
static const double a2 = +8.333333333262716094425037738346873e-3;
static const double a3 = -1.984126982005911439283646346964929e-4;
static const double a4 = +2.755731607338689220657382272783309e-6;
static const double a5 = -2.505185130214293595900283001271652e-8;
static const double a6 = +1.604729591825977403374012010065495e-10;
static const double a7 = -7.364589573262279913270651228486670e-13;

25 | |

26 | double sin1(double x) |

27 | { |

28 | return x * a0 |

29 | + x * x * x * a1 |

30 | + x * x * x * x * x * a2 |

31 | + x * x * x * x * x * x * x * a3 |

32 | + x * x * x * x * x * x * x * x * x * a4 |

33 | + x * x * x * x * x * x * x * x * x * x * x * a5 |

34 | + x * x * x * x * x * x * x * x * x * x * x * x * x * a6 |

35 | + x * x * x * x * x * x * x * x * x * x * x * x * x * x * x * a7; |

36 | } |

37 | |

38 | double sin2(double x) |

39 | { |

40 | double ret = 0.0; |

41 | double y = x; |

42 | double x2 = x * x; |

43 | ret += a0 * y; y *= x2; |

44 | ret += a1 * y; y *= x2; |

45 | ret += a2 * y; y *= x2; |

46 | ret += a3 * y; y *= x2; |

47 | ret += a4 * y; y *= x2; |

48 | ret += a5 * y; y *= x2; |

49 | ret += a6 * y; y *= x2; |

50 | ret += a7 * y; |

51 | return ret; |

52 | } |

53 | |

54 | double sin3(double x) |

55 | { |

56 | double x2 = x * x; |

57 | return x * (a0 + x2 * (a1 + x2 * (a2 + x2 * (a3 + x2 * (a4 + x2 * (a5 + x2 * (a6 + x2 * a7))))))); |

58 | } |

59 | |

60 | double sin4(double x) |

61 | { |

62 | double x2 = x * x; |

63 | double x4 = x2 * x2; |

64 | double A = a0 + x4 * (a2 + x4 * (a4 + x4 * a6)); |

65 | double B = a1 + x4 * (a3 + x4 * (a5 + x4 * a7)); |

66 | return x * (A + x2 * B); |

67 | } |

68 | |

69 | double sin5(double x) |

70 | { |

71 | double x2 = x * x; |

72 | double x4 = x2 * x2; |

73 | double x6 = x4 * x2; |

74 | double A = a0 + x6 * (a3 + x6 * a6); |

75 | double B = a1 + x6 * (a4 + x6 * a7); |

76 | double C = a2 + x6 * a5; |

77 | return x * (A + x2 * B + x4 * C); |

78 | } |

79 | |

80 | double sin6(double x) |

81 | { |

82 | double x2 = x * x; |

83 | double x4 = x2 * x2; |

84 | double x8 = x4 * x4; |

85 | double A = a0 + x2 * (a1 + x2 * (a2 + x2 * a3)); |

86 | double B = a4 + x2 * (a5 + x2 * (a6 + x2 * a7)); |

87 | return x * (A + x8 * B); |

88 | } |

89 | |

90 | double sin7(double x) |

91 | { |

92 | double x2 = x * x; |

93 | double x3 = x2 * x; |

94 | double x4 = x2 * x2; |

95 | double x8 = x4 * x4; |

96 | double x9 = x8 * x; |

97 | __asm__("" : "+x" (x2), "+x" (x3), "+x" (x4), "+x" (x8), "+x" (x9)); |

98 | double A = x3 * (a1 + x2 * (a2 + x2 * a3)); |

99 | double B = a4 + x2 * (a5 + x2 * (a6 + x2 * a7)); |

100 | double C = a0 * x; |

101 | return A + C + x9 * B; |

102 | } |

103 | |

104 | int main() |

105 | { |

106 | typedef chrono::high_resolution_clock clock_t; |

107 | clock_t::time_point t0, t1; |

108 | size_t const iterations = 10000000; |

109 | double const inv = 1.0 / iterations; |

110 | |

111 | double sum = 0.0; |

112 | t0 = clock_t::now(); |

113 | for (size_t run = 0; run < iterations; run++) |

114 | sum += run * inv; |

115 | t1 = clock_t::now(); |

116 | double norm = chrono::nanoseconds(t1 - t0).count() * inv; |

117 | |

118 | double sum0 = 0.0; |

119 | t0 = clock_t::now(); |

120 | for (size_t run = 0; run < iterations; run++) |

121 | sum0 += sin(run * inv); |

122 | t1 = clock_t::now(); |

123 | cout << "sin: " << chrono::nanoseconds(t1 - t0).count() * inv - norm << " ns" << endl; |

124 | |

125 | double sum1 = 0.0; |

126 | t0 = clock_t::now(); |

127 | for (size_t run = 0; run < iterations; run++) |

128 | sum1 += sin1(run * inv); |

129 | t1 = clock_t::now(); |

130 | cout << "sin1: " << chrono::nanoseconds(t1 - t0).count() * inv - norm << " ns" << endl; |

131 | |

132 | double sum2 = 0.0; |

133 | t0 = clock_t::now(); |

134 | for (size_t run = 0; run < iterations; run++) |

135 | sum2 += sin2(run * inv); |

136 | t1 = clock_t::now(); |

137 | cout << "sin2: " << chrono::nanoseconds(t1 - t0).count() * inv - norm << " ns" << endl; |

138 | |

139 | double sum3 = 0.0; |

140 | t0 = clock_t::now(); |

141 | for (size_t run = 0; run < iterations; run++) |

142 | sum3 += sin3(run * inv); |

143 | t1 = clock_t::now(); |

144 | cout << "sin3: " << chrono::nanoseconds(t1 - t0).count() * inv - norm << " ns" << endl; |

145 | |

146 | double sum4 = 0.0; |

147 | t0 = clock_t::now(); |

148 | for (size_t run = 0; run < iterations; run++) |

149 | sum4 += sin4(run * inv); |

150 | t1 = clock_t::now(); |

151 | cout << "sin4: " << chrono::nanoseconds(t1 - t0).count() * inv - norm << " ns" << endl; |

152 | |

153 | double sum5 = 0.0; |

154 | t0 = clock_t::now(); |

155 | for (size_t run = 0; run < iterations; run++) |

156 | sum5 += sin5(run * inv); |

157 | t1 = clock_t::now(); |

158 | cout << "sin5: " << chrono::nanoseconds(t1 - t0).count() * inv - norm << " ns" << endl; |

159 | |

160 | double sum6 = 0.0; |

161 | t0 = clock_t::now(); |

162 | for (size_t run = 0; run < iterations; run++) |

163 | sum6 += sin6(run * inv); |

164 | t1 = clock_t::now(); |

165 | cout << "sin6: " << chrono::nanoseconds(t1 - t0).count() * inv - norm << " ns" << endl; |

166 | |

167 | double sum7 = 0.0; |

168 | t0 = clock_t::now(); |

169 | for (size_t run = 0; run < iterations; run++) |

170 | sum7 += sin7(run * inv); |

171 | t1 = clock_t::now(); |

172 | cout << "sin7: " << chrono::nanoseconds(t1 - t0).count() * inv - norm << " ns" << endl; |

173 | |

174 | cout << setprecision(20); |

175 | cout << sum0 << endl; |

176 | cout << sum1 << endl; |

177 | cout << sum2 << endl; |

178 | cout << sum3 << endl; |

179 | cout << sum4 << endl; |

180 | cout << sum5 << endl; |

181 | cout << sum6 << endl; |

182 | cout << sum7 << endl; |

183 | |

184 | return sum + sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7 == 0.0; |

185 | } |

186 |