1 files changed, 222 insertions, 0 deletions
diff --git a/gdb-rhbz-2232086-reduce-size-of-gdb-index.patch b/gdb-rhbz-2232086-reduce-size-of-gdb-index.patch
new file mode 100644
index 0000000..9eaf615
--- /dev/null
+++ b/gdb-rhbz-2232086-reduce-size-of-gdb-index.patch
@@ -0,0 +1,222 @@
+From FEDORA_PATCHES Mon Sep 17 00:00:00 2001
+From: Andrew Burgess <aburgess@redhat.com>
+Date: Fri, 24 Nov 2023 11:50:35 +0000
+Subject: gdb-rhbz-2232086-reduce-size-of-gdb-index.patch
+
+;; Back-port upstream commit aa19bc1d259 as part of a fix for
+;; non-deterministic gdb-index generation (RH BZ 2232086).
+
+gdb: reduce size of generated gdb-index file
+
+I noticed in passing that our algorithm for generating the gdb-index
+file is incorrect.  When building the hash table in add_index_entry we
+count every incoming entry, and rehash when the number of entries gets
+too large.  However, some of the incoming entries will be duplicates,
+which don't actually result in new items being added to the hash
+table.
+
+As a result, we grow the gdb-index hash table far too often.
+
+With an unmodified GDB, generating a gdb-index for GDB, I see a file
+size of 90M, with a hash usage (in the generated index file) of just
+2.6%.
+
+With a patched GDB, generating a gdb-index for the _same_ GDB binary,
+I now see a gdb-index file size of 30M, with a hash usage of 41.9%.
+
+This is a 67% reduction in gdb-index file size.
+
+Obviously, not every gdb-index file is going to see such big savings,
+however, the larger a program, and the more symbols that are
+duplicated between compilation units, the more GDB would over count,
+and so, over-grow the index.
+
+The gdb-index hash table we create has a minimum size of 1024, and
+then we grow the hash when it is 75% full, doubling the hash table at
+that time.  Given this, then we expect that either:
+
+  a. The hash table is size 1024, and less than 75% full, or
+  b. The hash table is between 37.5% and 75% full.
+
+I've included a test that checks some of these constraints -- I've not
+bothered to check the upper limit, an over-full hash table isn't
+really a problem here, but if the fill percentage is less than 37.5%
+then this indicates that we've done something wrong (obviously, I also
+check for the 1024 minimum size).
+
+Approved-By: Tom Tromey <tom@tromey.com>
+
+diff --git a/gdb/dwarf2/index-write.c b/gdb/dwarf2/index-write.c
+--- a/gdb/dwarf2/index-write.c
++++ b/gdb/dwarf2/index-write.c
+@@ -254,20 +254,29 @@ add_index_entry (struct mapped_symtab *symtab, const char *name,
+ 		 int is_static, gdb_index_symbol_kind kind,
+ 		 offset_type cu_index)
+ {
+-  offset_type cu_index_and_attrs;
++  symtab_index_entry *slot = &find_slot (symtab, name);
++  if (slot->name == NULL)
++    {
++      /* This is a new element in the hash table.  */
++      ++symtab->n_elements;
+ 
+-  ++symtab->n_elements;
+-  if (4 * symtab->n_elements / 3 >= symtab->data.size ())
+-    hash_expand (symtab);
++      /* We might need to grow the hash table.  */
++      if (4 * symtab->n_elements / 3 >= symtab->data.size ())
++	{
++	  hash_expand (symtab);
+ 
+-  symtab_index_entry &slot = find_slot (symtab, name);
+-  if (slot.name == NULL)
+-    {
+-      slot.name = name;
++	  /* This element will have a different slot in the new table.  */
++	  slot = &find_slot (symtab, name);
++
++	  /* But it should still be a new element in the hash table.  */
++	  gdb_assert (slot->name == nullptr);
++	}
++
++      slot->name = name;
+       /* index_offset is set later.  */
+     }
+ 
+-  cu_index_and_attrs = 0;
++  offset_type cu_index_and_attrs = 0;
+   DW2_GDB_INDEX_CU_SET_VALUE (cu_index_and_attrs, cu_index);
+   DW2_GDB_INDEX_SYMBOL_STATIC_SET_VALUE (cu_index_and_attrs, is_static);
+   DW2_GDB_INDEX_SYMBOL_KIND_SET_VALUE (cu_index_and_attrs, kind);
+@@ -279,7 +288,7 @@ add_index_entry (struct mapped_symtab *symtab, const char *name,
+      the last entry pushed), but a symbol could have multiple kinds in one CU.
+      To keep things simple we don't worry about the duplication here and
+      sort and uniquify the list after we've processed all symbols.  */
+-  slot.cu_indices.push_back (cu_index_and_attrs);
++  slot->cu_indices.push_back (cu_index_and_attrs);
+ }
+ 
+ /* See symtab_index_entry.  */
+diff --git a/gdb/testsuite/gdb.gdb/index-file.exp b/gdb/testsuite/gdb.gdb/index-file.exp
+new file mode 100644
+--- /dev/null
++++ b/gdb/testsuite/gdb.gdb/index-file.exp
+@@ -0,0 +1,118 @@
++# Copyright 2023 Free Software Foundation, Inc.
++
++# This program is free software; you can redistribute it and/or modify
++# it under the terms of the GNU General Public License as published by
++# the Free Software Foundation; either version 3 of the License, or
++# (at your option) any later version.
++#
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++# GNU General Public License for more details.
++#
++# You should have received a copy of the GNU General Public License
++# along with this program.  If not, see <http://www.gnu.org/licenses/>.
++
++# Load the GDB executable, and then 'save gdb-index', and make some
++# checks of the generated index file.
++
++load_lib selftest-support.exp
++
++# Can't save an index with readnow.
++if {[readnow]} {
++    untested "cannot create an index when readnow is in use"
++    return -1
++}
++
++# A multiplier used to ensure slow tasks are less likely to timeout.
++set timeout_factor 20
++
++set filename [selftest_prepare]
++if { $filename eq "" } {
++    unsupported "${gdb_test_file_name}.exp"
++    return -1
++}
++
++with_timeout_factor $timeout_factor {
++    # Start GDB, load FILENAME.
++    clean_restart $filename
++}
++
++# Generate an index file.
++set dir1 [standard_output_file "index_1"]
++remote_exec host "mkdir -p ${dir1}"
++with_timeout_factor $timeout_factor {
++    gdb_test_no_output "save gdb-index $dir1" \
++	"create gdb-index file"
++}
++
++# Close GDB.
++gdb_exit
++
++# Validate that the index-file FILENAME has made efficient use of its
++# symbol hash table.  Calculate the number of symbols in the hash
++# table and the total hash table size.  The hash table starts with
++# 1024 entries, and then doubles each time it is filled to 75%.  At
++# 75% filled, doubling the size takes it to 37.5% filled.
++#
++# Thus, the hash table is correctly filled if:
++#  1. Its size is 1024 (i.e. it has not yet had its first doubling), or
++#  2. Its filled percentage is over 37%
++#
++# We could check that it is not over filled, but I don't as that's not
++# really an issue.  But we did once have a bug where the table was
++# doubled incorrectly, in which case we'd see a filled percentage of
++# around 2% in some cases, which is a huge waste of disk space.
++proc check_symbol_table_usage { filename } {
++    # Open the file in binary mode and read-only mode.
++    set fp [open $filename rb]
++
++    # Configure the channel to use binary translation.
++    fconfigure $fp -translation binary
++
++    # Read the first 28 bytes (7 * 4) of the file, which contain the
++    # header of the index section.
++    set header [read $fp [expr 7 * 4]]
++
++    # Scan six 32-bit integers from the header: the version, three
++    # ignored fields, the symbol table offset and the shortcut offset.
++    binary scan $header iiiiii version \
++	_ _ _ symbol_table_offset shortcut_offset
++
++    # The length of the symbol hash table (in 8-byte entries).
++    set len [expr ($shortcut_offset - $symbol_table_offset) / 8]
++
++    # Now walk the hash table and count how many entries are in use.
++    set offset $symbol_table_offset
++    set count 0
++    while { $offset < $shortcut_offset } {
++	seek $fp $offset
++	set entry [read $fp 8]
++	binary scan $entry ii name_ptr flags
++	if { $name_ptr != 0 } {
++	    incr count
++	}
++
++	incr offset 8
++    }
++
++    # Close the file.
++    close $fp
++
++    # Calculate how full the hash table is, as a percentage.
++    set pct [expr (100 * double($count)) / $len]
++
++    # Write our results out to the gdb.log.
++    verbose -log "Hash table size: $len"
++    verbose -log "Hash table entries: $count"
++    verbose -log "Percentage usage: $pct%"
++
++    # The minimum fill percentage is actually 37.5%, but we give TCL a
++    # little flexibility in case the FP maths give a result a little
++    # off.
++    gdb_assert { $len == 1024 || $pct > 37 } \
++	"symbol hash table usage"
++}
++
++set index_filename_base [file tail $filename]
++check_symbol_table_usage "$dir1/${index_filename_base}.gdb-index"