base大家族详解
1 简介
对于base编码的使用,参考Base What? A Practical Introduction to Base Encoding。
- base2: 2进制
- base10: 10进制
- base16: 计算机字节表示
- base32: 字母加数字
- base64: 更大的表示范围
- base58: bitcoin为了比base64更易读采用的表示
这些base编码主要是在计算机的二进制表示,和人可读的字符表示之间转换。
具体实现可以分为2种,能利用bit位直接转换的base(例如base2, base16,base32, base64),和不能利用bit位直接转换的base(例如base10, base58)。
2 实现代码
全部采用clojure实现,主要是为了理解base编码,只求代码容易理解,性能不是很好:
(ns base) (defn indices-of [f coll] (keep-indexed #(if (f %2) %1 nil) coll)) (defn first-index-of [f coll] (first (indices-of f coll))) (defn find-thing "查找`coll`中第一个等于value的索引" [value coll] (first-index-of #(= % value) coll)) (def divmod "返回[商 余数]" (juxt quot mod)) ;; 1. 不能利用bit位直接转换的base ;; 基本编码方法就是把输入作为一个数字,不断的对baseN的N进行除法运算, ;; 每次的余数就是base-table的索引,商作为下一次的除数进行计算。直到除尽 (defn base-encode-num "base编码一个数字`n` `leading-zeros`为padding个数 `base-table`为编码转换表" [n leading-zeros base-table] (let [base-count (count base-table)] (loop [n n result ""] (if (>= n base-count) (let [[d m] (divmod n base-count)] (recur d ;; 编码是从后往前进行的,每次除以baseN的N, ;; 以余数作为索引 (-> (nth base-table m) (str result)))) (->> (if (pos? n) (-> (nth base-table n) (str result)) result) ;; 注意,因为是从后往前进行生成的,padding在前面添加 (concat (repeat leading-zeros (first base-table))) (apply str)))))) (defn base-enc "编码base10,base36,base58等" [base-table s] (let [s-bytes (.getBytes s) leading-zeros (->> s-bytes (take-while zero?) count)] (-> (java.math.BigInteger. 1 s-bytes) (base-encode-num leading-zeros base-table)))) (defn invert-table "生成`table`序列的反向索引表 " [table] (into {} (map #(vector %1 %2) table (iterate inc 0)))) ;; baseN的解码方法就是对输入字符串的每个字符查找索引 * (N ** 输入串反向顺序的索引), ;; 结果累加就是目标数字,再把数字还原为字符串 (defn base-dec "解码base10,base36,base58等" [base-table s] (let [padding (->> s (take-while #(= % (first base-table))) (map (constantly (byte 0)))) inverted-table (invert-table base-table) base-count (count base-table)] (->> (reverse s) (reduce (fn [[r m] c] [(+ r (*' m (inverted-table c))) (*' m base-count)]) [0 1]) first str java.math.BigInteger. .toByteArray (drop-while zero?) (concat padding) byte-array String.))) (def base58-table "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz") (def enc-base58 (partial base-enc base58-table)) (def dec-base58 (partial base-dec base58-table)) (def base36-table "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ") (def enc-base36 (partial base-enc base36-table)) (def dec-base36 (partial base-dec base36-table)) (def base10-table "0123456789") (def enc-base10 (partial base-enc base10-table)) (def dec-base10 (partial base-dec base10-table)) (comment (def target "D9cS9N9iHjMLTdA8YSMRMp") (def r (dec-base58 target)) (= target (enc-base58 r)) (= "64021609848" (enc-base36 "testabc")) (enc-base10 "this is a test") ;; => "2361031878030638688519054699098996" (dec-base10 "2361031878030638688519054699098996") ;; => "this is a test" ) ;;; ============= 以2为基的base算法,base2, base4, base16, base32, base64 ... ;;; 可以利用bit位直接变换的base ;;; 算法把输入的8bit串转换为base(2^n)的nbit串,不需要做除法, ;;; 编码过程 直接把输入字符串转换为bit序列,再根据baseN的位宽(如base64 6位,base32 5位)重新划分, ;;; 查表获取目标串,不需要除法,因此要比上面的base算法速度快 ;;; 下面统一称为base2 (defn bits "转换数字为bit序列,如果不指定`width`,默认为8位" ([n] (bits n 8)) ([n width] (reverse (map #(bit-and (bit-shift-right n %) 1) (range width))))) (defn numb "bit序列转换为数字" [bits] (BigInteger. (apply str bits) 2)) (defn string-to-bits "字符串转换为bit序列" [msg] (->> (.getBytes msg) (map #(bits %)) flatten)) (defn padding-count "获取字节序列`v`的padding个数" [v] (->> (vec v) rseq (take-while zero?) count)) (defn drop-padding "去掉字节序列`v`的padding" [v] (-> (padding-count v) (drop-last v))) (defn log2 [n] (/ (Math/log n) (Math/log 2))) (defn get-bit-width "获取`base-table`的位宽,即用多少字节表示一个字符" [base-table] (let [width (-> (count base-table) log2)] (when (= width (Math/floor width)) (int width)))) (defn valid-base2-table? "检测是否为有效的base2表" [base-table] (-> (get-bit-width base-table) nil? not)) (defn byte-align "根据位宽bit-width,获取需要对齐的base字节数 比如一个字节是8位,转换base64,每个base64字节为6位 byte-align就是确定最低需要多少个base64的6位字节才能对齐到8位字节" [bit-width] (let [max-padding 12 fn-len-range (fn [width] (map (partial * width) (range 1 max-padding))) hex-range (fn-len-range 8) bit-range (fn-len-range bit-width) align-value (->> (clojure.set/intersection (set hex-range) (set bit-range)) (apply min))] (/ align-value bit-width))) (defn hex-byte-align "获取最低需要多少个hex字节才能对齐" [bit-width] (-> (byte-align bit-width) (* bit-width) (/ 8))) (comment (byte-align 6) ;; => 4 (hex-byte-align 6) ;; => 3 ;; base64需要4个6位字节才能对齐到3个8位hex字节,即 4 * 6 = 3 * 8 (byte-align 5) ;; => 8 (hex-byte-align 5) ;; => 5 ;; base32需要8个5位字节才能对齐到5个8位hex字节, 即 8 * 5 = 5 * 8 (hex-byte-align 4) ;; => 1 (hex-byte-align 2) ;; => 1 (hex-byte-align 1) ;; => 1 ;; 可以看到base2, base4,base16 可以对齐到1个字节,不需要padding ) (defn gen-padding "生成`data`的padding串" [bit-width data] (let [len (count data) align (byte-align bit-width) aligned-byte (mod len align)] (if (zero? aligned-byte) "" (apply str (-> (- align aligned-byte) (repeat "=")))))) (defn base2-enc "编码base2类型的字符串`s` `base-table`必须是以2为基的base表,base(2^n),例如base16,base32,base64" [base-table s] {:pre [(valid-base2-table? base-table)]} (let [bits (string-to-bits s) base2-char #(->> (numb %) (nth base-table)) bit-width (get-bit-width base-table) data (partition bit-width bit-width (repeat 0) bits)] (str (->> (map base2-char data) (apply str)) (gen-padding bit-width data)))) (defn base2-dec "base2解码,`s`为要解码的字符串 `base-table` 解码表" [base-table s] {:pre [(valid-base2-table? base-table)]} (let [bit-width (get-bit-width base-table) inverted-table (merge (invert-table base-table) {\= 0} ;; 添加padding字符 )] (->> (mapcat #(some-> (inverted-table %1) (bits bit-width)) s) ;; mapcat自动过滤掉base2-bits的nil值,即忽略s串中不属于base-table的字符 (partition 8 8 (repeat 0)) (map numb) drop-padding (map char) (apply str)))) ;; base64,base32 需要使用base2-enc dec (def base64-table "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/") (def enc-base64 (partial base2-enc base64-table)) (def dec-base64 (partial base2-dec base64-table)) (def base32-table "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567") (def enc-base32 (partial base2-enc base32-table)) (def dec-base32 (partial base2-dec base32-table)) ;; base16等同于hex转换 (def base16-table "0123456789ABCDEF") (def enc-base16 (partial base2-enc base16-table)) (def dec-base16 (partial base2-dec base16-table)) ;; base2是2进制表示 (def base2-table "01") (def enc-base2 (partial base2-enc base2-table)) (def dec-base2 (partial base2-dec base2-table)) ;; 自动定义base的宏 (defmacro defbase "根据base-table生成base编码和解码函数的定义" [base-name base-table] (let [table-name (symbol (str base-name "-table")) enc-fname (symbol (str "enc-" base-name)) dec-fname (symbol (str "dec-" base-name)) def-table `(def ~table-name ~base-table)] (if-let [bit-width (get-bit-width base-table)] `(do ~def-table (def ~enc-fname (partial base2-enc ~base-table)) (def ~dec-fname (partial base2-dec ~base-table))) `(do ~def-table (def ~enc-fname (partial base-enc ~base-table)) (def ~dec-fname (partial base-dec ~base-table)))))) ;; (defbase base64 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/") ;; (defbase base32 "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567") ;; (defbase base16 "0123456789ABCDEF") ;; (defbase base2 "01") ;; (defbase base58 "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz") ;; (defbase base36 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ") ;; (defbase base10 "0123456789") (defbase base8 "01234567") (comment base8-table ;; => "01234567" (enc-base8 "test 1") ;; => "3506256335020061" (dec-base8 *1) ;; => "test 1" (= 2 (padding-count [1 2 0 3 0 0])) (= 0 (padding-count [1 2 3])) (= 0 (padding-count [])) (= [1 2 0 3] (drop-padding [1 2 0 3 0 0])) (= "I" (base2-dec base64-table "SQ==")) (= "AM" (base2-dec base64-table "QU0=")) (base2-enc base64-table "I") ;; => "SQ==" (base2-enc base64-table "AM") ;; => "QU0=" (String. (base64/encode "I")) ;; => "SQ==" (String. (base64/encode "AM")) ;; => "QU0=" (= "this is a test" (base2-dec base32-table "ORUGS4ZANFZSAYJAORSXG5A=")) (base32/encode "this is a test") ;; => "ORUGS4ZANFZSAYJAORSXG5A=" (base2-enc base32-table "this is a test") ;; => "ORUGS4ZANFZSAYJAORSXG5A=" ;; base-table不符,产生异常 (= "base58_is_boring" (base2-dec base58-table "D9cS9N9iHjMLTdA8YSMRMp")) (= 6 (get-bit-width base64-table)) (= 5 (get-bit-width base32-table)) (= nil (get-bit-width base58-table)) (enc-base32 "I") ;; base32的padding比较多 ;; => "JE======" (dec-base32 *1) ;; => "I" ;; base16相当于内存hex表示 (enc-base16 "this is a test") ;; => "7468697320697320612074657374" (dec-base16 "7468697320697320612074657374") ;; => "this is a test" (dec-base16 "7468IIOO697320697320612074657374") ;; 不合法字符直接忽略 ;; => "this is a test" (codecs/bytes->hex (codecs/str->bytes "this is a test")) ;; => "7468697320697320612074657374" ;; base2相当于内存的二进制表示 (enc-base2 "t") ;; => "01110100" ;; \t的二进制 (Integer/toString (int \t) 2) ;; => "1110100" (dec-base2 "01110100") ;; => "t" ;; 注意解码时前面的0是必要的 (def s1 "this is a test") (= (enc-base64 s1) (String. (base64/encode s1))) (= (enc-base32 s1) (base32/encode s1)) ) ;;; ============== 自动base解码 (defn valid-base-str? "检查字符串s是否符合base-table" [base-table s] (clojure.set/subset? (set s) (-> (set base-table) (conj \=)))) (defn guess-base [text] (cond ;; 这里必须按从小到大的顺序测试 (valid-base-str? base16-table text) :base16 (valid-base-str? base32-table text) :base32 (valid-base-str? base64-table text) :base64 :else :unknown)) (defn decode "自动base解码" [text & {:keys [step debug] :or {step 1 debug false}}] (loop [text text step step] (when debug (println "step" step "decode:" text)) (case (guess-base text) :base16 (do (println "step" step "--> base16") (-> (dec-base16 text) (recur (inc step)))) :base32 (do (println "step" step "--> base32") (-> (dec-base32 text) (recur (inc step)))) :base64 (do (println "step" step "--> base64") (-> (dec-base64 text) (recur (inc step)))) :unknown (do (println "result:" text) text)))) (comment (def text "3441353234343435353735323442353634423539354134353336353333323536343735323444343535333537344234433441343634353535353535323533343734413335344134363439353534423530344135413437353634463444353334463441344534443435333235363533353534423531354134363431353334413335") ;; 经测试,直接转换bits对于长字符串速度比较慢 (decode text) ;; step 1 --> base16 ;; step 2 --> base16 ;; step 3 --> base32 ;; step 4 --> base32 ;; step 5 --> base64 ;; step 6 --> base64 ;; result: key:hi!venus ;; => "key:hi!venus" )
3 结论
base编码基本是码表转换,产生人可读的字符表示,不适合加密。
Created: 2019-04-11 四 10:28