	Upgrade blevesearch dependency to v2.0.1 (#14346)
* Upgrade blevesearch dependency to v2.0.1
* Update rupture to v1.0.0
* Fix test
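
Since bleve v2 is a new Go module major version, the upgrade is an import-path
change rather than a plain version bump. A minimal sketch of the consumer-side
change (the exact packages touched vary by call site):

	go get github.com/blevesearch/bleve/v2@v2.0.1

	// imports change from
	import "github.com/blevesearch/bleve"
	// to
	import "github.com/blevesearch/bleve/v2"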
parent 3aa53dc6bc
commit f5abe2f563

459 changed files with 7518 additions and 4211 deletions
830	vendor/github.com/blevesearch/zapx/v12/new.go	(generated, vendored, normal file)
@@ -0,0 +1,830 @@
//  Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package zap

import (
	"bytes"
	"encoding/binary"
	"math"
	"sort"
	"sync"

	"github.com/RoaringBitmap/roaring"
	index "github.com/blevesearch/bleve_index_api"
	segment "github.com/blevesearch/scorch_segment_api"
	"github.com/couchbase/vellum"
	"github.com/golang/snappy"
)

var NewSegmentBufferNumResultsBump int = 100
var NewSegmentBufferNumResultsFactor float64 = 1.0
var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0

// ValidateDocFields can be set by applications to perform additional checks
// on fields in a document being added to a new segment; by default it does
// nothing.
// This API is experimental and may be removed at any time.
var ValidateDocFields = func(field index.Field) error {
	return nil
}

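// A minimal sketch of how an application might install its own check via
// this hook; the 64-byte name limit and the init() placement below are
// purely illustrative assumptions, not part of this file:
//
//	func init() {
//		ValidateDocFields = func(field index.Field) error {
//			if len(field.Name()) > 64 {
//				return fmt.Errorf("field name too long: %q", field.Name())
//			}
//			return nil
//		}
//	}
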
// New creates an in-memory zap-encoded SegmentBase from a set of Documents
func (z *ZapPlugin) New(results []index.Document) (
	segment.Segment, uint64, error) {
	return z.newWithChunkMode(results, DefaultChunkMode)
}

func (*ZapPlugin) newWithChunkMode(results []index.Document,
	chunkMode uint32) (segment.Segment, uint64, error) {
	s := interimPool.Get().(*interim)

	var br bytes.Buffer
	if s.lastNumDocs > 0 {
		// use previous results to initialize the buf with an estimate
		// size, but note that the interim instance comes from a
		// global interimPool, so multiple scorch instances indexing
		// different docs can lead to low quality estimates
		estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) *
			NewSegmentBufferAvgBytesPerDocFactor)
		estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) *
			NewSegmentBufferNumResultsFactor)
		br.Grow(estimateAvgBytesPerDoc * estimateNumResults)
	}

	s.results = results
	s.chunkMode = chunkMode
	s.w = NewCountHashWriter(&br)

	storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets,
		err := s.convert()
	if err != nil {
		return nil, uint64(0), err
	}

	sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkMode,
		s.FieldsMap, s.FieldsInv, uint64(len(results)),
		storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets)

	if err == nil && s.reset() == nil {
		s.lastNumDocs = len(results)
		s.lastOutSize = len(br.Bytes())
		interimPool.Put(s)
	}

	return sb, uint64(len(br.Bytes())), err
}

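// A worked example of the Grow() pre-sizing above: with the default factors
// of 1.0, a previous build averaging 1024 bytes per doc and 900 incoming
// documents pre-size the buffer to 1024 * (900 + 100) bytes, about 1 MiB,
// so encoding proceeds without repeated reallocation.
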
var interimPool = sync.Pool{New: func() interface{} { return &interim{} }}

// interim holds temporary working data used while converting from
// analysis results to a zap-encoded segment
type interim struct {
	results []index.Document

	chunkMode uint32

	w *CountHashWriter

	// FieldsMap adds 1 to field id to avoid zero value issues
	//  name -> field id + 1
	FieldsMap map[string]uint16

	// FieldsInv is the inverse of FieldsMap
	//  field id -> name
	FieldsInv []string

	// Term dictionaries for each field
	//  field id -> term -> postings list id + 1
	Dicts []map[string]uint64

	// Terms for each field, where terms are sorted ascending
	//  field id -> []term
	DictKeys [][]string

	// Fields whose IncludeDocValues is true
	//  field id -> bool
	IncludeDocValues []bool

	// postings id -> bitmap of docNums
	Postings []*roaring.Bitmap

	// postings id -> freq/norm's, one for each docNum in postings
	FreqNorms        [][]interimFreqNorm
	freqNormsBacking []interimFreqNorm

	// postings id -> locs, one for each freq
	Locs        [][]interimLoc
	locsBacking []interimLoc

	numTermsPerPostingsList []int // key is postings list id
	numLocsPerPostingsList  []int // key is postings list id

	builder    *vellum.Builder
	builderBuf bytes.Buffer

	metaBuf bytes.Buffer

	tmp0 []byte
	tmp1 []byte

	lastNumDocs int
	lastOutSize int
}

func (s *interim) reset() (err error) {
	s.results = nil
	s.chunkMode = 0
	s.w = nil
	s.FieldsMap = nil
	s.FieldsInv = nil
	for i := range s.Dicts {
		s.Dicts[i] = nil
	}
	s.Dicts = s.Dicts[:0]
	for i := range s.DictKeys {
		s.DictKeys[i] = s.DictKeys[i][:0]
	}
	s.DictKeys = s.DictKeys[:0]
	for i := range s.IncludeDocValues {
		s.IncludeDocValues[i] = false
	}
	s.IncludeDocValues = s.IncludeDocValues[:0]
	for _, idn := range s.Postings {
		idn.Clear()
	}
	s.Postings = s.Postings[:0]
	s.FreqNorms = s.FreqNorms[:0]
	for i := range s.freqNormsBacking {
		s.freqNormsBacking[i] = interimFreqNorm{}
	}
	s.freqNormsBacking = s.freqNormsBacking[:0]
	s.Locs = s.Locs[:0]
	for i := range s.locsBacking {
		s.locsBacking[i] = interimLoc{}
	}
	s.locsBacking = s.locsBacking[:0]
	s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0]
	s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0]
	s.builderBuf.Reset()
	if s.builder != nil {
		err = s.builder.Reset(&s.builderBuf)
	}
	s.metaBuf.Reset()
	s.tmp0 = s.tmp0[:0]
	s.tmp1 = s.tmp1[:0]
	s.lastNumDocs = 0
	s.lastOutSize = 0

	return err
}

func (s *interim) grabBuf(size int) []byte {
	buf := s.tmp0
	if cap(buf) < size {
		buf = make([]byte, size)
		s.tmp0 = buf
	}
	return buf[0:size]
}

type interimStoredField struct {
	vals      [][]byte
	typs      []byte
	arrayposs [][]uint64 // array positions
}

type interimFreqNorm struct {
	freq    uint64
	norm    float32
	numLocs int
}

type interimLoc struct {
	fieldID   uint16
	pos       uint64
	start     uint64
	end       uint64
	arrayposs []uint64
}

func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) {
	s.FieldsMap = map[string]uint16{}

	s.getOrDefineField("_id") // _id field is fieldID 0

	for _, result := range s.results {
		result.VisitComposite(func(field index.CompositeField) {
			s.getOrDefineField(field.Name())
		})
		result.VisitFields(func(field index.Field) {
			s.getOrDefineField(field.Name())
		})
	}

	sort.Strings(s.FieldsInv[1:]) // keep _id as first field

	for fieldID, fieldName := range s.FieldsInv {
		s.FieldsMap[fieldName] = uint16(fieldID + 1)
	}

	if cap(s.IncludeDocValues) >= len(s.FieldsInv) {
		s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)]
	} else {
		s.IncludeDocValues = make([]bool, len(s.FieldsInv))
	}

	s.prepareDicts()

	for _, dict := range s.DictKeys {
		sort.Strings(dict)
	}

	s.processDocuments()

	storedIndexOffset, err := s.writeStoredFields()
	if err != nil {
		return 0, 0, 0, nil, err
	}

	var fdvIndexOffset uint64
	var dictOffsets []uint64

	if len(s.results) > 0 {
		fdvIndexOffset, dictOffsets, err = s.writeDicts()
		if err != nil {
			return 0, 0, 0, nil, err
		}
	} else {
		dictOffsets = make([]uint64, len(s.FieldsInv))
	}

	fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets)
	if err != nil {
		return 0, 0, 0, nil, err
	}

	return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil
}

func (s *interim) getOrDefineField(fieldName string) int {
	fieldIDPlus1, exists := s.FieldsMap[fieldName]
	if !exists {
		fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
		s.FieldsMap[fieldName] = fieldIDPlus1
		s.FieldsInv = append(s.FieldsInv, fieldName)

		s.Dicts = append(s.Dicts, make(map[string]uint64))

		n := len(s.DictKeys)
		if n < cap(s.DictKeys) {
			s.DictKeys = s.DictKeys[:n+1]
			s.DictKeys[n] = s.DictKeys[n][:0]
		} else {
			s.DictKeys = append(s.DictKeys, []string(nil))
		}
	}

	return int(fieldIDPlus1 - 1)
}

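// The "+1" convention used by FieldsMap (and by Dicts for postings list ids)
// exists because reading a missing key from a Go map yields the zero value;
// storing id+1 makes 0 unambiguously mean "absent" even though real ids
// start at 0. A minimal illustration:
//
//	m := map[string]uint16{}
//	m["_id"] = 0 + 1 // fieldID 0, stored as 1
//	if m["body"] == 0 {
//		// "body" was never defined; no comma-ok check needed
//	}
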
// fill Dicts and DictKeys from analysis results
func (s *interim) prepareDicts() {
	var pidNext int

	var totTFs int
	var totLocs int

	visitField := func(field index.Field) {
		fieldID := uint16(s.getOrDefineField(field.Name()))

		dict := s.Dicts[fieldID]
		dictKeys := s.DictKeys[fieldID]

		tfs := field.AnalyzedTokenFrequencies()
		for term, tf := range tfs {
			pidPlus1, exists := dict[term]
			if !exists {
				pidNext++
				pidPlus1 = uint64(pidNext)

				dict[term] = pidPlus1
				dictKeys = append(dictKeys, term)

				s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0)
				s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0)
			}

			pid := pidPlus1 - 1

			s.numTermsPerPostingsList[pid] += 1
			s.numLocsPerPostingsList[pid] += len(tf.Locations)

			totLocs += len(tf.Locations)
		}

		totTFs += len(tfs)

		s.DictKeys[fieldID] = dictKeys
	}

	for _, result := range s.results {
		// walk each composite field
		result.VisitComposite(func(field index.CompositeField) {
			visitField(field)
		})

		// walk each field
		result.VisitFields(visitField)
	}

	numPostingsLists := pidNext

	if cap(s.Postings) >= numPostingsLists {
		s.Postings = s.Postings[:numPostingsLists]
	} else {
		postings := make([]*roaring.Bitmap, numPostingsLists)
		copy(postings, s.Postings[:cap(s.Postings)])
		for i := 0; i < numPostingsLists; i++ {
			if postings[i] == nil {
				postings[i] = roaring.New()
			}
		}
		s.Postings = postings
	}

	if cap(s.FreqNorms) >= numPostingsLists {
		s.FreqNorms = s.FreqNorms[:numPostingsLists]
	} else {
		s.FreqNorms = make([][]interimFreqNorm, numPostingsLists)
	}

	if cap(s.freqNormsBacking) >= totTFs {
		s.freqNormsBacking = s.freqNormsBacking[:totTFs]
	} else {
		s.freqNormsBacking = make([]interimFreqNorm, totTFs)
	}

	freqNormsBacking := s.freqNormsBacking
	for pid, numTerms := range s.numTermsPerPostingsList {
		s.FreqNorms[pid] = freqNormsBacking[0:0]
		freqNormsBacking = freqNormsBacking[numTerms:]
	}

	if cap(s.Locs) >= numPostingsLists {
		s.Locs = s.Locs[:numPostingsLists]
	} else {
		s.Locs = make([][]interimLoc, numPostingsLists)
	}

	if cap(s.locsBacking) >= totLocs {
		s.locsBacking = s.locsBacking[:totLocs]
	} else {
		s.locsBacking = make([]interimLoc, totLocs)
	}

	locsBacking := s.locsBacking
	for pid, numLocs := range s.numLocsPerPostingsList {
		s.Locs[pid] = locsBacking[0:0]
		locsBacking = locsBacking[numLocs:]
	}
}

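// FreqNorms and Locs above are carved out of single backing slices so the
// many per-postings-list slices share one allocation. A standalone sketch of
// the same pattern, assuming counts[] was pre-tallied as in prepareDicts:
//
//	backing := make([]int, total)
//	out := make([][]int, len(counts))
//	for i, n := range counts {
//		out[i] = backing[0:0] // length 0; appends fill backing in place
//		backing = backing[n:] // next list begins n entries further in
//	}
//
// Correctness relies on each out[i] receiving exactly counts[i] appends,
// which numTermsPerPostingsList / numLocsPerPostingsList guarantee here.
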
func (s *interim) processDocuments() {
	numFields := len(s.FieldsInv)
	reuseFieldLens := make([]int, numFields)
	reuseFieldTFs := make([]index.TokenFrequencies, numFields)

	for docNum, result := range s.results {
		for i := 0; i < numFields; i++ { // clear these for reuse
			reuseFieldLens[i] = 0
			reuseFieldTFs[i] = nil
		}

		s.processDocument(uint64(docNum), result,
			reuseFieldLens, reuseFieldTFs)
	}
}

func (s *interim) processDocument(docNum uint64,
	result index.Document,
	fieldLens []int, fieldTFs []index.TokenFrequencies) {
	visitField := func(field index.Field) {
		fieldID := uint16(s.getOrDefineField(field.Name()))
		fieldLens[fieldID] += field.AnalyzedLength()

		existingFreqs := fieldTFs[fieldID]
		if existingFreqs != nil {
			existingFreqs.MergeAll(field.Name(), field.AnalyzedTokenFrequencies())
		} else {
			fieldTFs[fieldID] = field.AnalyzedTokenFrequencies()
		}
	}

	// walk each composite field
	result.VisitComposite(func(field index.CompositeField) {
		visitField(field)
	})

	// walk each field
	result.VisitFields(visitField)

	// now that it's been rolled up into fieldTFs, walk that
	for fieldID, tfs := range fieldTFs {
		dict := s.Dicts[fieldID]
		norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))

		for term, tf := range tfs {
			pid := dict[term] - 1
			bs := s.Postings[pid]
			bs.Add(uint32(docNum))

			s.FreqNorms[pid] = append(s.FreqNorms[pid],
				interimFreqNorm{
					freq:    uint64(tf.Frequency()),
					norm:    norm,
					numLocs: len(tf.Locations),
				})

			if len(tf.Locations) > 0 {
				locs := s.Locs[pid]

				for _, loc := range tf.Locations {
					var locf = uint16(fieldID)
					if loc.Field != "" {
						locf = uint16(s.getOrDefineField(loc.Field))
					}
					var arrayposs []uint64
					if len(loc.ArrayPositions) > 0 {
						arrayposs = loc.ArrayPositions
					}
					locs = append(locs, interimLoc{
						fieldID:   locf,
						pos:       uint64(loc.Position),
						start:     uint64(loc.Start),
						end:       uint64(loc.End),
						arrayposs: arrayposs,
					})
				}

				s.Locs[pid] = locs
			}
		}
	}
}

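// The norm computed in processDocument is the usual 1/sqrt(fieldLength)
// length normalization: a 100-token field yields norm 0.1 while a 4-token
// field yields 0.5, so matches in shorter fields score relatively higher.
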
func (s *interim) writeStoredFields() (
	storedIndexOffset uint64, err error) {
	varBuf := make([]byte, binary.MaxVarintLen64)
	metaEncode := func(val uint64) (int, error) {
		wb := binary.PutUvarint(varBuf, val)
		return s.metaBuf.Write(varBuf[:wb])
	}

	data, compressed := s.tmp0[:0], s.tmp1[:0]
	defer func() { s.tmp0, s.tmp1 = data, compressed }()

	// keyed by docNum
	docStoredOffsets := make([]uint64, len(s.results))

	// keyed by fieldID, for the current doc in the loop
	docStoredFields := map[uint16]interimStoredField{}

	for docNum, result := range s.results {
		for fieldID := range docStoredFields { // reset for next doc
			delete(docStoredFields, fieldID)
		}

		var validationErr error
		result.VisitFields(func(field index.Field) {
			fieldID := uint16(s.getOrDefineField(field.Name()))

			if field.Options().IsStored() {
				isf := docStoredFields[fieldID]
				isf.vals = append(isf.vals, field.Value())
				isf.typs = append(isf.typs, field.EncodedFieldType())
				isf.arrayposs = append(isf.arrayposs, field.ArrayPositions())
				docStoredFields[fieldID] = isf
			}

			if field.Options().IncludeDocValues() {
				s.IncludeDocValues[fieldID] = true
			}

			err := ValidateDocFields(field)
			if err != nil && validationErr == nil {
				validationErr = err
			}
		})
		if validationErr != nil {
			return 0, validationErr
		}

		var curr int

		s.metaBuf.Reset()
		data = data[:0]

		// _id field special case optimizes ExternalID() lookups
		idFieldVal := docStoredFields[uint16(0)].vals[0]
		_, err = metaEncode(uint64(len(idFieldVal)))
		if err != nil {
			return 0, err
		}

		// handle non-"_id" fields
		for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ {
			isf, exists := docStoredFields[uint16(fieldID)]
			if exists {
				curr, data, err = persistStoredFieldValues(
					fieldID, isf.vals, isf.typs, isf.arrayposs,
					curr, metaEncode, data)
				if err != nil {
					return 0, err
				}
			}
		}

		metaBytes := s.metaBuf.Bytes()

		compressed = snappy.Encode(compressed[:cap(compressed)], data)

		docStoredOffsets[docNum] = uint64(s.w.Count())

		_, err := writeUvarints(s.w,
			uint64(len(metaBytes)),
			uint64(len(idFieldVal)+len(compressed)))
		if err != nil {
			return 0, err
		}

		_, err = s.w.Write(metaBytes)
		if err != nil {
			return 0, err
		}

		_, err = s.w.Write(idFieldVal)
		if err != nil {
			return 0, err
		}

		_, err = s.w.Write(compressed)
		if err != nil {
			return 0, err
		}
	}

	storedIndexOffset = uint64(s.w.Count())

	for _, docStoredOffset := range docStoredOffsets {
		err = binary.Write(s.w, binary.BigEndian, docStoredOffset)
		if err != nil {
			return 0, err
		}
	}

	return storedIndexOffset, nil
}

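// Pieced together from the writes above, each per-document stored-fields
// record has the layout:
//
//	uvarint  len(metaBytes)
//	uvarint  len(idFieldVal) + len(compressed)
//	metaBytes  (uvarint-encoded stored-field metadata)
//	idFieldVal (raw _id bytes, left uncompressed for ExternalID() lookups)
//	compressed (snappy-compressed field data)
//
// followed, once all documents are written, by one big-endian uint64 record
// offset per document starting at storedIndexOffset.
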
func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) {
	dictOffsets = make([]uint64, len(s.FieldsInv))

	fdvOffsetsStart := make([]uint64, len(s.FieldsInv))
	fdvOffsetsEnd := make([]uint64, len(s.FieldsInv))

	buf := s.grabBuf(binary.MaxVarintLen64)

	// these int coders are initialized with chunk size 1024; however, this
	// will be reset to the correct chunk size while processing each
	// individual field-term section
	tfEncoder := newChunkedIntCoder(1024, uint64(len(s.results)-1))
	locEncoder := newChunkedIntCoder(1024, uint64(len(s.results)-1))

	var docTermMap [][]byte

	if s.builder == nil {
		s.builder, err = vellum.New(&s.builderBuf, nil)
		if err != nil {
			return 0, nil, err
		}
	}

	for fieldID, terms := range s.DictKeys {
		if cap(docTermMap) < len(s.results) {
			docTermMap = make([][]byte, len(s.results))
		} else {
			docTermMap = docTermMap[0:len(s.results)]
			for docNum := range docTermMap { // reset the docTermMap
				docTermMap[docNum] = docTermMap[docNum][:0]
			}
		}

		dict := s.Dicts[fieldID]

		for _, term := range terms { // terms are already sorted
			pid := dict[term] - 1

			postingsBS := s.Postings[pid]

			freqNorms := s.FreqNorms[pid]
			freqNormOffset := 0

			locs := s.Locs[pid]
			locOffset := 0

			chunkSize, err := getChunkSize(s.chunkMode, postingsBS.GetCardinality(), uint64(len(s.results)))
			if err != nil {
				return 0, nil, err
			}
			tfEncoder.SetChunkSize(chunkSize, uint64(len(s.results)-1))
			locEncoder.SetChunkSize(chunkSize, uint64(len(s.results)-1))

			postingsItr := postingsBS.Iterator()
			for postingsItr.HasNext() {
				docNum := uint64(postingsItr.Next())

				freqNorm := freqNorms[freqNormOffset]

				err = tfEncoder.Add(docNum,
					encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0),
					uint64(math.Float32bits(freqNorm.norm)))
				if err != nil {
					return 0, nil, err
				}

				if freqNorm.numLocs > 0 {
					numBytesLocs := 0
					for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
						numBytesLocs += totalUvarintBytes(
							uint64(loc.fieldID), loc.pos, loc.start, loc.end,
							uint64(len(loc.arrayposs)), loc.arrayposs)
					}

					err = locEncoder.Add(docNum, uint64(numBytesLocs))
					if err != nil {
						return 0, nil, err
					}

					for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
						err = locEncoder.Add(docNum,
							uint64(loc.fieldID), loc.pos, loc.start, loc.end,
							uint64(len(loc.arrayposs)))
						if err != nil {
							return 0, nil, err
						}

						err = locEncoder.Add(docNum, loc.arrayposs...)
						if err != nil {
							return 0, nil, err
						}
					}

					locOffset += freqNorm.numLocs
				}

				freqNormOffset++

				docTermMap[docNum] = append(
					append(docTermMap[docNum], term...),
					termSeparator)
			}

			tfEncoder.Close()
			locEncoder.Close()

			postingsOffset, err :=
				writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf)
			if err != nil {
				return 0, nil, err
			}

			if postingsOffset > uint64(0) {
				err = s.builder.Insert([]byte(term), postingsOffset)
				if err != nil {
					return 0, nil, err
				}
			}

			tfEncoder.Reset()
			locEncoder.Reset()
		}

		err = s.builder.Close()
		if err != nil {
			return 0, nil, err
		}

		// record where this dictionary starts
		dictOffsets[fieldID] = uint64(s.w.Count())

		vellumData := s.builderBuf.Bytes()

		// write out the length of the vellum data
		n := binary.PutUvarint(buf, uint64(len(vellumData)))
		_, err = s.w.Write(buf[:n])
		if err != nil {
			return 0, nil, err
		}

		// write this vellum to disk
		_, err = s.w.Write(vellumData)
		if err != nil {
			return 0, nil, err
		}

		// reset vellum for reuse
		s.builderBuf.Reset()

		err = s.builder.Reset(&s.builderBuf)
		if err != nil {
			return 0, nil, err
		}

		// write the field doc values
		// NOTE: doc values continue to use legacy chunk mode
		chunkSize, err := getChunkSize(LegacyChunkMode, 0, 0)
		if err != nil {
			return 0, nil, err
		}
		fdvEncoder := newChunkedContentCoder(chunkSize, uint64(len(s.results)-1), s.w, false)
		if s.IncludeDocValues[fieldID] {
			for docNum, docTerms := range docTermMap {
				if len(docTerms) > 0 {
					err = fdvEncoder.Add(uint64(docNum), docTerms)
					if err != nil {
						return 0, nil, err
					}
				}
			}
			err = fdvEncoder.Close()
			if err != nil {
				return 0, nil, err
			}

			fdvOffsetsStart[fieldID] = uint64(s.w.Count())

			_, err = fdvEncoder.Write()
			if err != nil {
				return 0, nil, err
			}

			fdvOffsetsEnd[fieldID] = uint64(s.w.Count())

			fdvEncoder.Reset()
		} else {
			fdvOffsetsStart[fieldID] = fieldNotUninverted
			fdvOffsetsEnd[fieldID] = fieldNotUninverted
		}
	}

	fdvIndexOffset = uint64(s.w.Count())

	for i := 0; i < len(fdvOffsetsStart); i++ {
		n := binary.PutUvarint(buf, fdvOffsetsStart[i])
		_, err := s.w.Write(buf[:n])
		if err != nil {
			return 0, nil, err
		}
		n = binary.PutUvarint(buf, fdvOffsetsEnd[i])
		_, err = s.w.Write(buf[:n])
		if err != nil {
			return 0, nil, err
		}
	}

	return fdvIndexOffset, dictOffsets, nil
}

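// Each field's term dictionary is a vellum FST mapping term -> postings
// offset, which is why DictKeys must be sorted first: vellum requires keys
// in ascending order. A minimal standalone sketch of the same API:
//
//	var buf bytes.Buffer
//	b, _ := vellum.New(&buf, nil)
//	_ = b.Insert([]byte("apple"), 42) // keys must arrive in sorted order
//	_ = b.Insert([]byte("pear"), 99)
//	_ = b.Close()
//	fst, _ := vellum.Load(buf.Bytes())
//	offset, found, _ := fst.Get([]byte("pear")) // offset == 99, found == true
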
// totalUvarintBytes returns the total # of bytes needed to encode the given
// uint64's in binary.PutUvarint() encoding
func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) {
	n = numUvarintBytes(a)
	n += numUvarintBytes(b)
	n += numUvarintBytes(c)
	n += numUvarintBytes(d)
	n += numUvarintBytes(e)
	for _, v := range more {
		n += numUvarintBytes(v)
	}
	return n
}

// numUvarintBytes returns the # of bytes needed to encode x in
// binary.PutUvarint() encoding
func numUvarintBytes(x uint64) (n int) {
	for x >= 0x80 {
		x >>= 7
		n++
	}
	return n + 1
}
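
// numUvarintBytes mirrors the length binary.PutUvarint would produce without
// doing the encoding; a quick property check (sketch):
//
//	buf := make([]byte, binary.MaxVarintLen64)
//	for _, x := range []uint64{0, 0x7f, 0x80, 1 << 20, math.MaxUint64} {
//		if numUvarintBytes(x) != binary.PutUvarint(buf, x) {
//			panic("numUvarintBytes disagrees with binary.PutUvarint")
//		}
//	}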