base大家族详解

时间:2022-01-30 22:51:52

base大家族详解

1 简介

对于base编码的使用,参考Base What? A Practical Introduction to Base Encoding

  • base2: 2进制
  • base10: 10进制
  • base16: 计算机字节表示
  • base32: 字母加数字
  • base64: 更大的表示范围
  • base58: bitcoin为了比base64更易读采用的表示

这些base编码主要是在计算机的二进制表示,和人可读的字符表示之间转换。

具体实现可以分为2种,能利用bit位直接转换的base(例如base2, base16,base32, base64),和不能利用bit位直接转换的base(例如base10, base58)。

2 实现代码

全部采用clojure实现,主要是为了理解base编码,只求代码容易理解,性能不是很好:

(ns base)

(defn indices-of
  [f coll]
  (keep-indexed #(if (f %2) %1 nil) coll))

(defn first-index-of
  [f coll]
  (first (indices-of f coll)))

(defn find-thing
  "查找`coll`中第一个等于value的索引"
  [value coll]
  (first-index-of #(= % value) coll))

(def divmod "返回[商 余数]" (juxt quot mod))

;; 1. 不能利用bit位直接转换的base
;; 基本编码方法就是把输入作为一个数字,不断的对baseN的N进行除法运算,
;; 每次的余数就是base-table的索引,商作为下一次的除数进行计算。直到除尽

(defn base-encode-num
  "base编码一个数字`n`
  `leading-zeros`为padding个数
  `base-table`为编码转换表"
  [n leading-zeros base-table]
  (let [base-count (count base-table)]
    (loop [n n
           result ""]
      (if (>= n base-count)
        (let [[d m] (divmod n base-count)]
          (recur d
                ;; 编码是从后往前进行的,每次除以baseN的N,
                ;; 以余数作为索引
                (-> (nth base-table m)
                    (str result))))
        (->> (if (pos? n)
               (-> (nth base-table n)
                   (str result))
               result)
             ;; 注意,因为是从后往前进行生成的,padding在前面添加
             (concat (repeat leading-zeros (first base-table)))
             (apply str))))))

(defn base-enc
  "编码base10,base36,base58等"
  [base-table s]
  (let [s-bytes (.getBytes s)
        leading-zeros (->> s-bytes
                           (take-while zero?)
                           count)]
    (-> (java.math.BigInteger. 1 s-bytes)
        (base-encode-num leading-zeros base-table))))

(defn invert-table
  "生成`table`序列的反向索引表 "
  [table]
  (into {}
        (map #(vector %1 %2)
             table
             (iterate inc 0))))

;; baseN的解码方法就是对输入字符串的每个字符查找索引 * (N ** 输入串反向顺序的索引),
;; 结果累加就是目标数字,再把数字还原为字符串

(defn base-dec
  "解码base10,base36,base58等"
  [base-table s]
  (let [padding (->> s
                     (take-while #(= % (first base-table)))
                     (map (constantly (byte 0))))
        inverted-table (invert-table base-table)
        base-count (count base-table)]
    (->> (reverse s)
         (reduce (fn [[r m] c]
                   [(+ r (*' m (inverted-table c)))
                    (*' m base-count)]) [0 1])
         first
         str
         java.math.BigInteger.
         .toByteArray
         (drop-while zero?)
         (concat padding)
         byte-array
         String.)))

(def base58-table "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz")
(def enc-base58 (partial base-enc base58-table))
(def dec-base58 (partial base-dec base58-table))

(def base36-table "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")
(def enc-base36 (partial base-enc base36-table))
(def dec-base36 (partial base-dec base36-table))

(def base10-table "0123456789")
(def enc-base10 (partial base-enc base10-table))
(def dec-base10 (partial base-dec base10-table))

(comment
  (def target "D9cS9N9iHjMLTdA8YSMRMp")

  (def r (dec-base58 target))

  (= target (enc-base58 r))

  (= "64021609848" (enc-base36 "testabc"))

  (enc-base10 "this is a test")
  ;; => "2361031878030638688519054699098996"

  (dec-base10 "2361031878030638688519054699098996")
  ;; => "this is a test"

  )

;;; ============= 以2为基的base算法,base2, base4, base16, base32, base64 ...
;;; 可以利用bit位直接变换的base
;;; 算法把输入的8bit串转换为base(2^n)的nbit串,不需要做除法,
;;; 编码过程 直接把输入字符串转换为bit序列,再根据baseN的位宽(如base64 6位,base32 5位)重新划分,
;;;  查表获取目标串,不需要除法,因此要比上面的base算法速度快
;;; 下面统一称为base2

(defn bits
  "转换数字为bit序列,如果不指定`width`,默认为8位"
  ([n] (bits n 8))
  ([n width]
        (reverse (map #(bit-and (bit-shift-right n %) 1) (range width)))))

(defn numb
  "bit序列转换为数字"
  [bits]
  (BigInteger. (apply str bits) 2))

(defn string-to-bits
  "字符串转换为bit序列"
  [msg]
  (->> (.getBytes msg)
       (map #(bits %))
       flatten))

(defn padding-count
  "获取字节序列`v`的padding个数"
  [v]
  (->> (vec v)
       rseq
       (take-while zero?)
       count))

(defn drop-padding
  "去掉字节序列`v`的padding"
  [v]
  (-> (padding-count v)
      (drop-last v)))

(defn log2
  [n]
  (/ (Math/log n)
     (Math/log 2)))

(defn get-bit-width
  "获取`base-table`的位宽,即用多少字节表示一个字符"
  [base-table]
  (let [width (-> (count base-table)
                  log2)]
    (when (= width (Math/floor width))
      (int width))))

(defn valid-base2-table?
  "检测是否为有效的base2表"
  [base-table]
  (-> (get-bit-width base-table)
      nil?
      not))

(defn byte-align
  "根据位宽bit-width,获取需要对齐的base字节数
  比如一个字节是8位,转换base64,每个base64字节为6位
  byte-align就是确定最低需要多少个base64的6位字节才能对齐到8位字节"
  [bit-width]
  (let [max-padding 12
        fn-len-range (fn [width]
                       (map (partial * width) (range 1 max-padding)))
        hex-range (fn-len-range 8)
        bit-range (fn-len-range bit-width)
        align-value (->> (clojure.set/intersection (set hex-range) (set bit-range))
                         (apply min))]
    (/ align-value bit-width)))

(defn hex-byte-align
  "获取最低需要多少个hex字节才能对齐"
  [bit-width]
  (-> (byte-align bit-width)
      (* bit-width)
      (/ 8)))

(comment

  (byte-align 6)
  ;; => 4

  (hex-byte-align 6)
  ;; => 3
  ;; base64需要4个6位字节才能对齐到3个8位hex字节,即 4 * 6 = 3 * 8

  (byte-align 5)
  ;; => 8

  (hex-byte-align 5)
  ;; => 5
  ;; base32需要8个5位字节才能对齐到5个8位hex字节, 即 8 * 5 = 5 * 8

  (hex-byte-align 4)
  ;; => 1
  (hex-byte-align 2)
  ;; => 1
  (hex-byte-align 1)
  ;; => 1
  ;; 可以看到base2, base4,base16 可以对齐到1个字节,不需要padding

  )

(defn gen-padding
  "生成`data`的padding串"
  [bit-width data]
  (let [len (count data)
        align (byte-align bit-width)
        aligned-byte (mod len align)]
    (if (zero? aligned-byte)
      ""
      (apply str
             (-> (- align aligned-byte)
                 (repeat "="))))))

(defn base2-enc
  "编码base2类型的字符串`s`
  `base-table`必须是以2为基的base表,base(2^n),例如base16,base32,base64"
  [base-table s]
  {:pre [(valid-base2-table? base-table)]}
  (let [bits (string-to-bits s)
        base2-char #(->> (numb %)
                         (nth base-table))
        bit-width (get-bit-width base-table)
        data (partition bit-width bit-width (repeat 0) bits)]
    (str (->> (map base2-char data)
              (apply str))
         (gen-padding bit-width data))))

(defn base2-dec
  "base2解码,`s`为要解码的字符串
  `base-table` 解码表"
  [base-table s]
  {:pre [(valid-base2-table? base-table)]}
  (let [bit-width (get-bit-width base-table)
        inverted-table (merge (invert-table base-table)
                              {\= 0} ;; 添加padding字符
                              )]
    (->> (mapcat #(some-> (inverted-table %1)
                          (bits bit-width))
                 s)
         ;; mapcat自动过滤掉base2-bits的nil值,即忽略s串中不属于base-table的字符
         (partition 8 8 (repeat 0))
         (map numb)
         drop-padding
         (map char)
         (apply str))))

;; base64,base32 需要使用base2-enc dec
(def base64-table "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/")
(def enc-base64 (partial base2-enc base64-table))
(def dec-base64 (partial base2-dec base64-table))

(def base32-table "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567")
(def enc-base32 (partial base2-enc base32-table))
(def dec-base32 (partial base2-dec base32-table))

;; base16等同于hex转换
(def base16-table "0123456789ABCDEF")
(def enc-base16 (partial base2-enc base16-table))
(def dec-base16 (partial base2-dec base16-table))

;; base2是2进制表示
(def base2-table "01")
(def enc-base2 (partial base2-enc base2-table))
(def dec-base2 (partial base2-dec base2-table))

;; 自动定义base的宏
(defmacro defbase
  "根据base-table生成base编码和解码函数的定义"
  [base-name base-table]
  (let [table-name (symbol (str base-name "-table"))
        enc-fname (symbol (str "enc-" base-name))
        dec-fname (symbol (str "dec-" base-name))
        def-table `(def ~table-name ~base-table)]
    (if-let [bit-width (get-bit-width base-table)]
      `(do ~def-table
           (def ~enc-fname (partial base2-enc ~base-table))
           (def ~dec-fname (partial base2-dec ~base-table)))
      `(do ~def-table
           (def ~enc-fname (partial base-enc ~base-table))
           (def ~dec-fname (partial base-dec ~base-table))))))

;; (defbase base64 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/")
;; (defbase base32 "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567")
;; (defbase base16 "0123456789ABCDEF")
;; (defbase base2 "01")
;; (defbase base58 "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz")
;; (defbase base36 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")
;; (defbase base10 "0123456789")

(defbase base8 "01234567")

(comment

  base8-table
  ;; => "01234567"

  (enc-base8 "test 1")
  ;; => "3506256335020061"

  (dec-base8 *1)
  ;; => "test 1"

  (= 2 (padding-count [1 2 0 3 0 0]))

  (= 0 (padding-count [1 2 3]))

  (= 0 (padding-count []))

  (= [1 2 0 3] (drop-padding [1 2 0 3 0 0]))

  (= "I" (base2-dec base64-table "SQ=="))

  (= "AM" (base2-dec base64-table "QU0="))

  (base2-enc base64-table "I")
  ;; => "SQ=="

  (base2-enc base64-table "AM")
  ;; => "QU0="

  (String. (base64/encode "I"))
  ;; => "SQ=="

  (String. (base64/encode "AM"))
  ;; => "QU0="

  (= "this is a test" (base2-dec base32-table "ORUGS4ZANFZSAYJAORSXG5A="))

  (base32/encode "this is a test")
  ;; => "ORUGS4ZANFZSAYJAORSXG5A="

  (base2-enc base32-table "this is a test")
  ;; => "ORUGS4ZANFZSAYJAORSXG5A="

  ;; base-table不符,产生异常
  (= "base58_is_boring" (base2-dec base58-table "D9cS9N9iHjMLTdA8YSMRMp"))

  (= 6 (get-bit-width base64-table))

  (= 5 (get-bit-width base32-table))

  (= nil (get-bit-width base58-table))

  (enc-base32 "I") ;; base32的padding比较多
  ;; => "JE======"

  (dec-base32 *1)
  ;; => "I"

  ;; base16相当于内存hex表示
  (enc-base16 "this is a test")
  ;; => "7468697320697320612074657374"

  (dec-base16 "7468697320697320612074657374")
  ;; => "this is a test"

  (dec-base16 "7468IIOO697320697320612074657374") ;; 不合法字符直接忽略
  ;; => "this is a test"

  (codecs/bytes->hex (codecs/str->bytes "this is a test"))
  ;; => "7468697320697320612074657374"

  ;; base2相当于内存的二进制表示
  (enc-base2 "t")
  ;; => "01110100"

  ;; \t的二进制
  (Integer/toString (int \t) 2)
  ;; => "1110100"

  (dec-base2 "01110100")
  ;; => "t"
  ;; 注意解码时前面的0是必要的

  (def s1 "this is a test")
  (= (enc-base64 s1) (String. (base64/encode s1)))

  (= (enc-base32 s1) (base32/encode s1))

  )

;;; ============== 自动base解码

(defn valid-base-str?
  "检查字符串s是否符合base-table"
  [base-table s]
  (clojure.set/subset? (set s)
                       (-> (set base-table)
                           (conj \=))))

(defn guess-base
  [text]
  (cond
    ;; 这里必须按从小到大的顺序测试
    (valid-base-str? base16-table text) :base16
    (valid-base-str? base32-table text) :base32
    (valid-base-str? base64-table text) :base64
    :else :unknown))

(defn decode
  "自动base解码"
  [text & {:keys [step debug]
           :or {step 1
                debug false}}]
  (loop [text text
         step step]
    (when debug
      (println "step" step "decode:" text))
    (case (guess-base text)
      :base16 (do (println "step" step "--> base16")
                  (-> (dec-base16 text)
                      (recur (inc step))))
      :base32 (do (println "step" step "--> base32")
                  (-> (dec-base32 text)
                      (recur (inc step))))
      :base64 (do (println "step" step "--> base64")
                  (-> (dec-base64 text)
                      (recur (inc step))))
      :unknown (do (println "result:" text)
                   text))))


(comment
  (def text "3441353234343435353735323442353634423539354134353336353333323536343735323444343535333537344234433441343634353535353535323533343734413335344134363439353534423530344135413437353634463444353334463441344534443435333235363533353534423531354134363431353334413335")

  ;; 经测试,直接转换bits对于长字符串速度比较慢
  (decode text)
  ;; step 1 --> base16
  ;; step 2 --> base16
  ;; step 3 --> base32
  ;; step 4 --> base32
  ;; step 5 --> base64
  ;; step 6 --> base64
  ;; result: key:hi!venus
  ;; => "key:hi!venus"

  )

3 结论

base编码基本是码表转换,产生人可读的字符表示,不适合加密。

作者: ntestoc

Created: 2019-04-11 四 10:28