REBOL [
	title: "Various compression algorithms"
	file: %compression.r
	author: "Marco Antoniazzi"
	email: [luce80 AT libero DOT it]
	date: 14-02-2013
	version: 0.0.3
	Purpose: "Collect and show various compression algorithms (Huffman, RLE, LZ77, LZSS, LZ78, LZW)."
	History: [
		0.0.1 [01-01-2013 "Started"]
		0.0.2 [12-02-2013 "ok"]
		0.0.3 [14-02-2013 "Fixed silly bug in lzw binary! decompress"]
	]
	library: [
		level: 'intermediate
		platform: 'all
		type: [function tool]
		domain: [compression]
		tested-under: [View 2.7.8.3.1]
		support: none
		license: 'public-domain
	]
	notes: {
		I should say that these functions are made slow on pourpose.
		Any optimization for speed or bits-saving is left as an exercise to the reader ;)
		
		Do not esitate to help me improve this script by adding more algorithms.
	}
]

compression: context [
	; some functions taken or partially derived from an article of Ole Friis in www.rebolforces.com

	alphadigits: "0123456789ABCDEF"
	enbase: func [value [integer!] /base base-value [integer!] /local result num][
		if value = 0 [return "0"]
		base-value: any [base-value 16]
		result: copy ""
		while [value <> 0][
			num: mod value base-value
			insert result any [alphadigits/(num + 1) "0"]
			value: (value - num) / base-value
		]
		result
	]
	debase: func [value [any-string!] /base base-value [integer!] /local num char pos][
		base-value: any [base-value 16]
		;FIXME: if 0 <> mod (length? string) base-value [make error! "wrong string length"]
		num: 0
		foreach char value [
			if none? pos: find alphadigits uppercase char [break]
			num: num * base-value + (index? pos) - 1
		]
		num
	]
	from-base2: func [value][debase/base value 2]
	to-base2: func [value][enbase/base value 2]
	
	
	
	bit-pos: 1
	bin-result: copy #{}
	to-bin: func [num][system/words/debase/base to-hex num 16]
	bits-set: func [num len /local shifted-num result][
		; shift num left to bit-pos
		shifted-num: shift/left to-integer num 32 - len - bit-pos + 1 
		; overwrite dest binary with shifted number
		result: change bin-result bin-result or to-bin shifted-num
		; update bit number and bin-result pos
		bit-pos: bit-pos + len
		while [bit-pos > 8] [bit-pos: bit-pos - 8 bin-result: next bin-result]
		head result
	]
	bits-get: func [len /local mask num][
		mask: shift to-integer #{80000000} len - 1
		mask: shift/logical mask bit-pos - 1
		; extract bits using mask and then shift them to get the right number
		num: mask and to-integer copy/part bin-result 4
		num: shift/logical num 32 - len - bit-pos + 1
		bit-pos: bit-pos + len
		while [bit-pos > 8] [bit-pos: bit-pos - 8 bin-result: next bin-result]
		num
	]

	encode-bitstream: func [;author Ole Friis
		"Encodes a string of 1's and 0's to a binary string"
		s [string!] "The string of 1's and 0's"
		/local res byte add-this
		][
		res: copy #{}
		forever [
			byte: 0
			add-this: 128
			while [(add-this <> 0) and (not tail? s)] [
				if #"1" = first s [byte: byte + add-this]
				add-this: shift add-this 1
				s: next s
			]
			append res to-char byte
			if tail? s [return res]
		]
	]
	decode-bitstream: func [;author Ole Friis
		"Decodes a binary string into a string of 1's and 0's"
		s [binary!] "The binary string"
		/local res next-bit
		][
		res: copy ""
		next-bit: 128
		while [not tail? s] [
			append res either 0 = and~ next-bit to-integer first s ["0"]["1"]
			next-bit: shift next-bit 1
			if next-bit = 0 [next-bit: 128  s: next s]
		]
		res
	]	



	huffman: context [

		stats: none ; The statistics
		tree: none ; the Huffman tree
		codes: none ; the translation table with alphabet symbols and their Huffman code
		huffman-code: copy "" ; temporary partial Huffman code

		build_stats: func [
			{build a block with all symbols and their count (eg. [2 #"e" 7 #"a" ...])}
			data [any-string!] "data to analyze"
			/local char pos
			][
			stats: copy []
			data: as-string data
			foreach char data [
				either pos: find/case stats char [pos/(-1): pos/(-1) + 1][insert insert stats 1 char]
			]
			
			sort/skip stats 2
		]

		build_tree: func [
			"build the Huffman tree" ; the tree is a list of lists something like: [[[a 3] [[[c 1] [b 1] 2] [e 2] 4] 7]]
			stats [block!] "The statistics"
			/local node1 node2 weight new-node temp-list
			][
			tree: copy []
			; Create a leaf node for each symbol and add it to the tree
			forskip stats 2 [repend/only tree [second stats first stats]]

			while [1 < length? tree][
				node1: take tree
				node2: take tree
				weight: (last node1) + (last node2)
				new-node: reduce [node1 node2 weight]
				; Insert the new node correctly (sorted) in the tree (we could use a bisection)
				temp-list: head tree
				while [all [not tail? temp-list  (last first temp-list) <= weight]][
				  temp-list: next temp-list
				]
				insert/only temp-list new-node
			]
			; Return the top element of the Huffman tree
			tree: first tree
		]

		build_table: func [
			"recursively traverse the huffman tree building a translation table"
			tree [block!]
			/local node
			][
			if block? node: first tree [
				append huffman-code "0"
				build_table node
			]
			either block? node: second tree [
				append huffman-code "1"
				build_table node
			][
				repend codes [first tree copy huffman-code]
			]
			remove back tail huffman-code
			make hash! codes
		]

		compress: func [
			"Huffman-compress a string series"
			data [any-string!] "Data to compress"
			/table block [block!]  "The Huffman translation table to use"
			/to-block "Give result as a block of strings"
			/local char result code block-result
			][
			result: copy ""
			block-result: copy []
			block: any [block do [codes: copy [] build_table build_tree build_stats data]]
			foreach char data [
				insert tail result code: select block to-char char ;cannot use path selection because it is case INSENSITIVE (!?)
				if to-block [insert tail block-result code]
			]
			if to-block [return block-result]
			result: encode-bitstream result
		]

		to-canonical: func [
			"Transform a Huffman translation table to a canonical one"
			/table block [block!]  "The Huffman translation table to use"
			/local codes-copy canonical-codes current-bit-length new-code
			][
			codes-copy: copy any [block codes]
			sort/skip codes-copy 2
			canonical-codes: copy []
			;The first symbol in the list gets assigned a codeword which is the same length as the symbol's original codeword but all zeros. This will often be a single zero ('0').
			current-bit-length: length? second codes-copy
			repend canonical-codes [first codes-copy head insert/dup clear "" "0" current-bit-length]

			new-code: second canonical-codes
			codes-copy: skip codes-copy 2
			forskip codes-copy 2 [
				current-bit-length: length? second codes-copy
				new-code: to-base2 (from-base2 new-code) + 1
				if current-bit-length > length? new-code [insert new-code "0"]
				if current-bit-length > length? new-code [insert tail new-code "0"]
				repend canonical-codes [first codes-copy new-code]
			]
			make hash! canonical-codes
		]

		decompress: func [
			"decompress Huffman encoded block or binary"
			data [block! binary!] "data to decompress"
			/tree tree-block [block!]  "The Huffman tree to use"
			/table block [block!]  "The Huffman translation table to use"
			/local result char temp-code node
			][
			result: copy #{}
			if block? data [data: rejoin data]
			if binary? data [data: decode-bitstream data]
			either block [
				while [not tail? data][
					temp-code: clear ""
					char: false
					while [not char][
						append temp-code first+ data
						char: find/case block temp-code
						if char [char: first back char]
					]
					append result char
				]
			][
				tree-block: any [tree-block self/tree]
				while [not tail? data][
					node: tree-block
					until [
						node: either (first+ data) = #"0" [first node] [second node]
						(length? node) = 2 ; a leaf
					]
					append result first node
				]
			]
			data: head data
			result
		]
		
	] ; huffman
	
	rle: context [

		compress: func [
			"Run length (aka PackBits) compresses a string series and returns it."
			data [any-string!] "Data to compress"
			/local start count char result
			][
			result: copy #{}
			while [not tail? data] [
				count: 0
				start: data
				char: pick data 1
				; collect a sequence of not equal characters (optionally including some pairs of equals)
				while [all [any [char <> pick data 2 char <> pick data 3] count < 127]][
					count: count + 1
					data: next data
					char: pick data 1
				]
				if count > 0 [repend result [to-char (count - 1) copy/part start count]]
				if count = 127 [data: back data]
				if tail? data [break] 
				count: 0
				start: data
				char: pick data 1
				; collect a sequence of all equal characters
				while [all [char = pick data 1 count < 128]][
					count: count + 1
					data: next data
				]
				if count > 1 [repend result [to-char (256 - count + 1) char]]
			]
			data: head data
			result
		]

		decompress: func [
			"decompress a run length (aka PackBits) compressed binary"
			data [binary!] "Data to decompress"
			/local char count result
			][
			result: copy #{}
			while [not tail? data] [
				char: first data
				case [
					char < 128 [
						count: 1 + char
						insert tail result copy/part next data count
						data: skip data count + 1
					]
					char > 128 [
						insert/dup tail result to-char second data (256 + 1 - char)
						data: skip data 2
					]
					char = 128 [
						data: skip data 1
					]
				]
			]
			data: head data
			result
		]
	] ; rle

	; utility functions for LZ77 and LZSS
		shift_window: func [look-ahead-buffer positions][
			set look-ahead-buffer skip get look-ahead-buffer positions 				
		]
		match-length: func [a b /local start][
			start: a
			while [all [a/1 = b/1 not tail? a]][a: next a b: next b]
			offset? start a
		]
		find_longest_match: func [search data search-buffer-size look-ahead-buffer-size /local pos len off length result][
			pos: data
			length: 0
			result: head insert insert clear [] 0 0
			while [pos: find/case/reverse pos first data] [
				if (off: offset? pos data) > search-buffer-size [break]
				if (len: match-length pos data) > length [
					if len > look-ahead-buffer-size [break]
					length: len
					change/part result reduce [off length] 2
				]
			]
			result
		]
	;

	lz77: context [

		result: copy []

		compress: func [
			"LZ77 compress a string series"
			data [any-string!] "data to compress"
			/to-block "return abstract block"
			/local
			look-ahead-buffer look-ahead-buffer-size search-buffer search-buffer-size position length
			emit
			][
			clear result
			look-ahead-buffer: data
			look-ahead-buffer-size: 15
			search-buffer: data
			search-buffer-size: 255 ; if you increase this you have to change the code to generate binary! accordingly

			emit: func [pos len char][insert insert insert tail result pos len char]

			while [not empty? look-ahead-buffer] [
				;go backwards in search buffer to find longest match of the look-ahead buffer
				set [position length] find_longest_match search-buffer look-ahead-buffer search-buffer-size look-ahead-buffer-size
				emit position length any [pick look-ahead-buffer length + 1 "^@"]
				shift_window 'look-ahead-buffer length + 1
			]

			probe result
			; convert to compact binary!
			unless to-block [
				bit-pos: 1 ; reset bit position
				bin-result: clear head bin-result
				while [not tail? result] [
					either 0 = first result [
						insert insert tail bin-result to-char first result third result
					][
						insert insert insert tail bin-result to-char first result to-char second result third result
					]
					result: skip result 3
				]
				return copy head bin-result
			]

			copy result
		]

		decompress: func [
			"decompress LZ77 encoded binary"
			data [block! binary!] "String to decompress"
			/local flag outstring index char out bytes
			][
			if binary? data [
				;convert back to block
				;(obviously we could convert directly to final binary but this is kind of "educational")
				bit-pos: 1 ; reset bit position
				bin-result: copy data
				out: copy []
				while [0 < length? bin-result][
					flag: first bin-result
					insert tail out flag
					either flag != 0 [
						insert tail out any [pick bin-result 2 0]
						insert tail out to-char any [pick bin-result 3 0]
						bin-result: skip bin-result 3
					][
						insert tail out 0
						insert tail out to-char second bin-result
						bin-result: skip bin-result 2
					]
				]
				probe data: out
			]

			out: make binary! length? data
			foreach [offset length symbol] data [
				;go reverse in previous output by offset characters and copy character wise for length symbols;
				out: insert out bytes: copy/part skip out negate offset length
				; special case for duplicated bytes (encoding reaches look-ahead buffer)
				if length > offset [
					out: insert/dup out bytes (to-integer length / offset) - 1
					out: insert out copy/part bytes mod length offset
				] 
				out: insert out symbol
			]
			head out
		]

	] ; lz77

	lzss: context [

		result: copy []

		compress: func [
			"LZSS compress a string series"
			data [any-string!] "data to compress"
			/to-block "return abstract block"
			/local
			look-ahead-buffer look-ahead-buffer-size search-buffer search-buffer-size minimum-match-length position length
			emit emit-char
			][
			clear result
			look-ahead-buffer: data
			look-ahead-buffer-size: 15
			search-buffer: data
			search-buffer-size: 4095
			minimum-match-length: 2
			emit: func [flag pos len][insert insert insert tail result flag pos len]
			emit-char: func [flag char][insert insert tail result flag char]

			while [not empty? look-ahead-buffer] [
				;go backwards in search buffer to find longest match of the look-ahead buffer
				set [position length] find_longest_match search-buffer look-ahead-buffer search-buffer-size look-ahead-buffer-size
				either length > minimum-match-length [
					emit 0 position length
					shift_window 'look-ahead-buffer length
				][
					emit-char 1 first look-ahead-buffer
					shift_window 'look-ahead-buffer 1
				]
			]
			
			probe result
			; convert to compact binary!
			; (note that the (un)compressed flag is reversed from standard since this simplifies decoding)
			unless to-block [
				bit-pos: 1 ; reset bit position
				bin-result: clear head bin-result
				while [not tail? result] [
					either 0 = first result [
						bits-set 0 1
						bits-set second result 12
						bits-set third result 4
						result: skip result 3
					][
						bits-set 1 1
						bits-set second result 8
						result: skip result 2
					]
				]
				return copy head bin-result
			]
			copy result
		]

		decompress: func [
			"decompress LZSS encoded binary"
			data [block! string! binary!] "data to decompress"
			/local offset length out flag bytes
			][
			if string? data [data: to-binary data]
			if binary? data [
				;convert back to block
				;(obviously we could convert directly to final binary but this is kind of "educational")
				bit-pos: 1 ; reset bit position
				bin-result: copy data
				out: copy []
				while [0 < length? bin-result][
					flag: bits-get 1
					insert tail out flag
					either flag = 0 [
						insert tail out bits-get 12
						insert tail out bits-get 4
					][
						insert tail out to-char bits-get 8
					]
				]
				data: out
			]
			probe data

			out: make binary! length? data
			while [not empty? data] [
				either 0 = first data [ 
					offset: second data
					length: third data
					;go reverse in previous output by offset characters and copy character wise for length symbols;
					out: insert out bytes: copy/part skip out negate offset length
					; special case for duplicated bytes (encoding reaches look-ahead buffer)
					if length > offset [
						out: insert/dup out bytes (to-integer length / offset) - 1
						out: insert out copy/part bytes mod length offset
					] 
					data: skip data 3
				][
					out: insert out second data ; copy uncompressed character
					data: skip data 2
				]
			]
			head out
		]

	] ; lzss
	
	lz78: context [

		result: copy []
		string-table: make hash! []

		add_table_entry: func [string][head insert tail string-table string]

		compress: func [
			"LZ78 compress a string"
			data [any-string!] "String to compress"
			/to-block "return abstract block"
			/local
			bytes byte bytes+byte index
			emit index_from_string
			][
			clear result
			clear string-table
			emit: func [code char][head insert insert tail result code char]
			index_from_string: func [string /local pos][(pos: find/case string-table string) either pos [index? pos][0]]

			bytes: ""
			foreach byte data [
				bytes+byte: join bytes byte
				either find/case string-table bytes+byte [
					bytes: bytes+byte
				][
					emit index_from_string bytes byte
					add_table_entry bytes+byte
					clear bytes
				]
			]
			emit index_from_string bytes ""

			probe result
			; convert to compact binary!
			unless to-block [
				bit-pos: 1 ; reset bit position
				bin-result: clear head bin-result
				index: 1
				forskip result 2 [
					bits-set first result to integer! 0.99999 + log-2 index ; this math calcs necessary number of bits
					bits-set second result 8
					index: index + 1
				]
				return copy head bin-result
			]

			copy result
		]

		decompress: func [
			"decompress LZ78 encoded binary"
			data [block! binary!] "data to decompress"
			/local
			outstring index char out
			emit string_from_index
			][
			if binary? data [
				;convert back to block
				;(obviously we could convert directly to final binary but this is kind of "educational")
				bit-pos: 1 ; reset bit position
				bin-result: copy data
				out: copy []
				insert tail out 0
				insert tail out to-char bits-get 8
				index: 2
				while [0 < length? bin-result][
					insert tail out bits-get to integer! 0.99999 + log-2 index
					insert tail out to-char bits-get 8
					index: index + 1
				]
				data: out
			]
			probe data

			out: make binary! length? data
			emit: func [string][insert tail out string]
			string_from_index: func [index][either index > 0 [string-table/(index)][""]]

			clear string-table
			forskip data 2 [
				;read pair of index and character from input
				index: first data
				char: second data
				outstring: join string_from_index index char
				emit outstring
				add_table_entry outstring
			]
			out
		]

	] ; lz78

	lzw: context [; encoding and decoding algorithms translated from TIFF documentation

		clearcode: 256
		endofinformation: 257
		result: copy []
		string-table: make hash! []
		bit-len: 9
		
		initialize_table: does [
			either empty? string-table [
				clear head string-table
				for n 0 255 1 [insert tail string-table to-string to-char n]
				insert tail string-table clearcode
				insert tail string-table endofinformation
			][
				string-table: head clear at string-table 259
			]
			bit-len: 9
		]

		compress: func [
			"LZW compresses a string series and returns it."
			data [any-string!] "Data to compress"
			/to-block "return abstract block"
			/local bytes byte bytes+byte out
			add_table_entry emit code_from_string
			][
			result: clear head result

			bit-pos: 1 ; reset bit position
			bin-result: clear head bin-result

			initialize_table

			add_table_entry: func [string][
				insert tail string-table string
				switch length? string-table [
					512  [bit-len: 10]
					1024 [bit-len: 11]
					2048 [bit-len: 12]
					4096 [emit clearcode initialize_table]
				]
			]
			emit: func [code][
				head insert tail result either code > 255 [code][to-char code] ;remove or comment this if you want only binary
				unless to-block [bits-set code bit-len]
			]
			code_from_string: func [string][(index? find/case string-table string) - 1] ; - 1 because index? is 1 based

			emit clearcode
			bytes: ""
			foreach byte data [
				bytes+byte: join bytes byte
				either find/case string-table bytes+byte [
					bytes: bytes+byte
				][
					emit code_from_string to-string bytes
					add_table_entry bytes+byte
					bytes: byte
				]
			] ; end of for loop
			emit code_from_string to-string bytes
			emit endofinformation

			probe result
			unless to-block [return copy head bin-result]
			copy result
		]

		decompress: func [
			"Decompress LZW compressed binary"
			data [block! binary!] "Data to decompress"
			/local code oldcode out
			add_table_entry emit is_in_table
			][
			if binary? data [
				bit-pos: 1 ; reset bit position
				bin-result: copy data
				bit-len: 9
			]

			out: make binary! length? data
			add_table_entry: func [string][
				insert tail string-table string
				switch length? string-table [
					511  [bit-len: 10]
					1023 [bit-len: 11]
					2047 [bit-len: 12]
				]
			]
			emit: func [code][insert tail out code ]
			is_in_table: func [code][if code < 256 [code: to-string to-char code] any [find/case string-table code code < length? string-table]]
			get-code: func [][
				either block? data [
					to-integer first+ data
				][
					bits-get bit-len
				]
			]

			while [(code: get-code) != endofinformation] [
				either code = clearcode [
					initialize_table
					code: get-code data
					if code = endofinformation [break]
					emit string-table/(code + 1)
				][ ; end of clearcode case
					either is_in_table code [
						emit string-table/(code + 1)
						add_table_entry join string-table/(oldcode + 1) first string-table/(code + 1)
					][
						outstring: join string-table/(oldcode + 1) first string-table/(oldcode + 1)
						emit outstring
						add_table_entry outstring
					]
				] ; end of not-clearcode case
				oldcode: code
			] ; end of while loop
			out
		]

	]; lzw

]

; comment [
do [
	string: "abracadabradadadadadadaaaaaaaa"
	print "^/Huffman compress:"
	probe string
	probe stream: compression/huffman/compress/to-block string
	probe as-string compression/huffman/decompress stream
	print "^/RLE compress:"
	probe string
	probe stream: compression/rle/compress string
	probe as-string compression/rle/decompress stream
	print "^/LZ77 compress:"
	probe string
	probe stream: compression/lz77/compress string
	probe as-string compression/lz77/decompress stream
	print "^/LZSS compress:"
	probe string
	probe stream: compression/lzss/compress string
	probe as-string compression/lzss/decompress stream
	print "^/LZ78 compress:"
	probe string
	probe stream: compression/lz78/compress string
	probe as-string compression/lz78/decompress stream
	print "^/LZW compress:"
	probe string
	probe stream: compression/lzw/compress string
	probe as-string compression/lzw/decompress stream

	halt
]