// Mirror of https://codeberg.org/forgejo/forgejo.git (synced 2025-11-04 08:21:11 +00:00).
// File listing: 136 lines, 3.6 KiB, Go.

// Package chardet ports character set detection from ICU.
 | 
						|
package chardet
 | 
						|
 | 
						|
import (
 | 
						|
	"errors"
 | 
						|
	"sort"
 | 
						|
)
 | 
						|
 | 
						|
// Result contains all the information that the charset detector gives for one
// candidate charset.
type Result struct {
	// Charset is the IANA name of the detected charset.
	Charset string
	// Language is the IANA name of the detected language. It may be empty for some charsets.
	Language string
	// Confidence is the confidence of the Result, on a scale from 1 to 100.
	// The bigger the value, the more confident the detection.
	Confidence int
}
 | 
						|
 | 
						|
// Detector implements charset detection.
 | 
						|
type Detector struct {
 | 
						|
	recognizers []recognizer
 | 
						|
	stripTag    bool
 | 
						|
}
 | 
						|
 | 
						|
// List of charset recognizers
 | 
						|
var recognizers = []recognizer{
 | 
						|
	newRecognizer_utf8(),
 | 
						|
	newRecognizer_utf16be(),
 | 
						|
	newRecognizer_utf16le(),
 | 
						|
	newRecognizer_utf32be(),
 | 
						|
	newRecognizer_utf32le(),
 | 
						|
	newRecognizer_8859_1_en(),
 | 
						|
	newRecognizer_8859_1_da(),
 | 
						|
	newRecognizer_8859_1_de(),
 | 
						|
	newRecognizer_8859_1_es(),
 | 
						|
	newRecognizer_8859_1_fr(),
 | 
						|
	newRecognizer_8859_1_it(),
 | 
						|
	newRecognizer_8859_1_nl(),
 | 
						|
	newRecognizer_8859_1_no(),
 | 
						|
	newRecognizer_8859_1_pt(),
 | 
						|
	newRecognizer_8859_1_sv(),
 | 
						|
	newRecognizer_8859_2_cs(),
 | 
						|
	newRecognizer_8859_2_hu(),
 | 
						|
	newRecognizer_8859_2_pl(),
 | 
						|
	newRecognizer_8859_2_ro(),
 | 
						|
	newRecognizer_8859_5_ru(),
 | 
						|
	newRecognizer_8859_6_ar(),
 | 
						|
	newRecognizer_8859_7_el(),
 | 
						|
	newRecognizer_8859_8_I_he(),
 | 
						|
	newRecognizer_8859_8_he(),
 | 
						|
	newRecognizer_windows_1251(),
 | 
						|
	newRecognizer_windows_1256(),
 | 
						|
	newRecognizer_KOI8_R(),
 | 
						|
	newRecognizer_8859_9_tr(),
 | 
						|
 | 
						|
	newRecognizer_sjis(),
 | 
						|
	newRecognizer_gb_18030(),
 | 
						|
	newRecognizer_euc_jp(),
 | 
						|
	newRecognizer_euc_kr(),
 | 
						|
	newRecognizer_big5(),
 | 
						|
 | 
						|
	newRecognizer_2022JP(),
 | 
						|
	newRecognizer_2022KR(),
 | 
						|
	newRecognizer_2022CN(),
 | 
						|
 | 
						|
	newRecognizer_IBM424_he_rtl(),
 | 
						|
	newRecognizer_IBM424_he_ltr(),
 | 
						|
	newRecognizer_IBM420_ar_rtl(),
 | 
						|
	newRecognizer_IBM420_ar_ltr(),
 | 
						|
}
 | 
						|
 | 
						|
// NewTextDetector creates a Detector for plain text.
 | 
						|
func NewTextDetector() *Detector {
 | 
						|
	return &Detector{recognizers, false}
 | 
						|
}
 | 
						|
 | 
						|
// NewHtmlDetector creates a Detector for Html.
 | 
						|
func NewHtmlDetector() *Detector {
 | 
						|
	return &Detector{recognizers, true}
 | 
						|
}
 | 
						|
 | 
						|
// NotDetectedError is the error returned by DetectAll, and passed through by
// DetectBest, when no recognizer reports a positive Confidence for the input.
//
// The identifier and message are kept as they are for compatibility with
// existing callers.
var NotDetectedError = errors.New("Charset not detected.")
 | 
						|
 | 
						|
// DetectBest returns the Result with highest Confidence.
 | 
						|
func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
 | 
						|
	var all []Result
 | 
						|
	if all, err = d.DetectAll(b); err == nil {
 | 
						|
		r = &all[0]
 | 
						|
	}
 | 
						|
	return
 | 
						|
}
 | 
						|
 | 
						|
// DetectAll returns all Results which have non-zero Confidence. The Results are sorted by Confidence in descending order.
 | 
						|
func (d *Detector) DetectAll(b []byte) ([]Result, error) {
 | 
						|
	input := newRecognizerInput(b, d.stripTag)
 | 
						|
	outputChan := make(chan recognizerOutput)
 | 
						|
	for _, r := range d.recognizers {
 | 
						|
		go matchHelper(r, input, outputChan)
 | 
						|
	}
 | 
						|
	outputs := make([]recognizerOutput, 0, len(d.recognizers))
 | 
						|
	for i := 0; i < len(d.recognizers); i++ {
 | 
						|
		o := <-outputChan
 | 
						|
		if o.Confidence > 0 {
 | 
						|
			outputs = append(outputs, o)
 | 
						|
		}
 | 
						|
	}
 | 
						|
	if len(outputs) == 0 {
 | 
						|
		return nil, NotDetectedError
 | 
						|
	}
 | 
						|
 | 
						|
	sort.Sort(recognizerOutputs(outputs))
 | 
						|
	dedupOutputs := make([]Result, 0, len(outputs))
 | 
						|
	foundCharsets := make(map[string]struct{}, len(outputs))
 | 
						|
	for _, o := range outputs {
 | 
						|
		if _, found := foundCharsets[o.Charset]; !found {
 | 
						|
			dedupOutputs = append(dedupOutputs, Result(o))
 | 
						|
			foundCharsets[o.Charset] = struct{}{}
 | 
						|
		}
 | 
						|
	}
 | 
						|
	if len(dedupOutputs) == 0 {
 | 
						|
		return nil, NotDetectedError
 | 
						|
	}
 | 
						|
	return dedupOutputs, nil
 | 
						|
}
 | 
						|
 | 
						|
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
 | 
						|
	outputChan <- r.Match(input)
 | 
						|
}
 | 
						|
 | 
						|
type recognizerOutputs []recognizerOutput
 | 
						|
 | 
						|
func (r recognizerOutputs) Len() int           { return len(r) }
 | 
						|
func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
 | 
						|
func (r recognizerOutputs) Swap(i, j int)      { r[i], r[j] = r[j], r[i] }
 |