Reading the contents of an ELF section(programmatically) - c

I am trying to retrieve the contents of an additional section within an ELF binary. At this point, I'm using the following code to retrieve the name of each section:
#include <stdio.h>
#include <unistd.h>
#include <stdint.h>
#include <stdlib.h>
#pragma pack(push,1)
#pragma pack(pop)
#define EI_NIDENT 16
/* 32-bit ELF base types. */
typedef unsigned int Elf32_Addr;
typedef unsigned short Elf32_Half;
typedef unsigned int Elf32_Off;
typedef signed int Elf32_Sword;
typedef unsigned int Elf32_Word;
/* 64-bit ELF base types. */
typedef unsigned long long Elf64_Addr;
typedef unsigned short Elf64_Half;
typedef signed short Elf64_SHalf;
typedef unsigned long long Elf64_Off;
typedef signed int Elf64_Sword;
typedef unsigned int Elf64_Word;
typedef unsigned long long Elf64_Xword;
typedef signed long long Elf64_Sxword;
typedef struct elf32_hdr{
unsigned char e_ident[EI_NIDENT];
Elf32_Half e_type;
Elf32_Half e_machine;
Elf32_Word e_version;
Elf32_Addr e_entry; /* Entry point */
Elf32_Off e_phoff;
Elf32_Off e_shoff;
Elf32_Word e_flags;
Elf32_Half e_ehsize;
Elf32_Half e_phentsize;
Elf32_Half e_phnum;
Elf32_Half e_shentsize;
Elf32_Half e_shnum;
Elf32_Half e_shstrndx;
} Elf32_Ehdr;
typedef struct elf32_shdr {
Elf32_Word sh_name;
Elf32_Word sh_type;
Elf32_Word sh_flags;
Elf32_Addr sh_addr;
Elf32_Off sh_offset;
Elf32_Word sh_size;
Elf32_Word sh_link;
Elf32_Word sh_info;
Elf32_Word sh_addralign;
Elf32_Word sh_entsize;
} Elf32_Shdr;
typedef struct elf64_hdr {
unsigned char e_ident[EI_NIDENT]; /* ELF "magic number" */
Elf64_Half e_type;
Elf64_Half e_machine;
Elf64_Word e_version;
Elf64_Addr e_entry; /* Entry point virtual address */
Elf64_Off e_phoff; /* Program header table file offset */
Elf64_Off e_shoff; /* Section header table file offset */
Elf64_Word e_flags;
Elf64_Half e_ehsize;
Elf64_Half e_phentsize;
Elf64_Half e_phnum;
Elf64_Half e_shentsize;
Elf64_Half e_shnum;
Elf64_Half e_shstrndx;
} Elf64_Ehdr;
typedef struct elf64_shdr {
Elf64_Word sh_name; /* Section name, index in string tbl */
Elf64_Word sh_type; /* Type of section */
Elf64_Xword sh_flags; /* Miscellaneous section attributes */
Elf64_Addr sh_addr; /* Section virtual addr at execution */
Elf64_Off sh_offset; /* Section file offset */
Elf64_Xword sh_size; /* Size of section in bytes */
Elf64_Word sh_link; /* Index of another section */
Elf64_Word sh_info; /* Additional section information */
Elf64_Xword sh_addralign; /* Section alignment */
Elf64_Xword sh_entsize; /* Entry size if section holds table */
} Elf64_Shdr;
int main(int argc, char **argv)
{
FILE* ElfFile = NULL;
char* SectNames = NULL;
Elf64_Ehdr elfHdr;
Elf64_Shdr sectHdr;
uint32_t idx;
if(argc != 2) {
printf("usage: %s <ELF_FILE>\n", argv[0]);
exit(1);
}
if((ElfFile = fopen(argv[1], "r")) == NULL) {
perror("[E] Error opening file:");
exit(1);
}
// read ELF header, first thing in the file
fread(&elfHdr, 1, sizeof(Elf64_Ehdr), ElfFile);
// read section name string table
// first, read its header.
/*
e_shoff This member holds the section header table's file offset
in bytes. If the file has no section header table, this
member holds zero.
e_shstrndx This member holds the section header table index of the
entry associated with the section name string table. If
the file has no section name string table, this member
holds the value SHN_UNDEF.
If the index of section name string table section is
larger than or equal to SHN_LORESERVE (0xff00), this
member holds SHN_XINDEX (0xffff) and the real index of
the section name string table section is held in the
sh_link member of the initial entry in section header
table. Otherwise, the sh_link member of the initial
entry in section header table contains the value zero.
SHN_UNDEF This value marks an undefined, missing,
irrelevant, or otherwise meaningless
section reference. For example, a symbol
"defined" relative to section number
SHN_UNDEF is an undefined symbol.
SHN_LORESERVE This value specifies the lower bound of the
range of reserved indices.
SHN_LOPROC Values greater than or equal to SHN_HIPROC
are reserved for processor-specific
semantics.
SHN_HIPROC Values less than or equal to SHN_LOPROC are
reserved for processor-specific semantics.
SHN_ABS This value specifies absolute values for
the corresponding reference. For example,
symbols defined relative to section number
SHN_ABS have absolute values and are not
affected by relocation.
SHN_COMMON Symbols defined relative to this section
are common symbols, such as Fortran COMMON
or unallocated C external variables.
SHN_HIRESERVE This value specifies the upper bound of the
range of reserved indices between
SHN_LORESERVE and SHN_HIRESERVE, inclusive;
the values do not reference the section
header table. That is, the section header
table does not contain entries for the
reserved indices.
*/
fseek(ElfFile, elfHdr.e_shoff + elfHdr.e_shstrndx * sizeof(sectHdr), SEEK_SET);
fread(&sectHdr, 1, sizeof(sectHdr), ElfFile);
/*
sh_size This member holds the section's size in bytes. Unless the
section type is SHT_NOBITS, the section occupies sh_size
bytes in the file. A section of type SHT_NOBITS may have a
nonzero size, but it occupies no space in the file.
sh_offset This member's value holds the byte offset from the
beginning of the file to the first byte in the section.
One section type, SHT_NOBITS, occupies no space in the
file, and its sh_offset member locates the conceptual
placement in the file.
e_shnum This member holds the number of entries in the section
header table. Thus the product of e_shentsize and
e_shnum gives the section header table's size in bytes.
If a file has no section header table, e_shnum holds the
value of zero.
If the number of entries in the section header table is
larger than or equal to SHN_LORESERVE (0xff00), e_shnum
holds the value zero and the real number of entries in
the section header table is held in the sh_size member of
the initial entry in section header table. Otherwise,
the sh_size member of the initial entry in the section
header table holds the value zero.
sh_name This member specifies the name of the section. Its value
is an index into the section header string table section,
giving the location of a null-terminated string.
*/
// next, read the section, string data
// printf("sh_size = %llu\n", sectHdr.sh_size);
SectNames = malloc(sectHdr.sh_size);
fseek(ElfFile, sectHdr.sh_offset, SEEK_SET);
fread(SectNames, 1, sectHdr.sh_size, ElfFile);
// read all section headers
for (idx = 0; idx < elfHdr.e_shnum; idx++)
{
const char* name = "";
fseek(ElfFile, elfHdr.e_shoff + idx * sizeof(sectHdr), SEEK_SET);
fread(&sectHdr, 1, sizeof(sectHdr), ElfFile);
// print section name
if (sectHdr.sh_name);
name = SectNames + sectHdr.sh_name;
printf("%2u %s\n", idx, name);
}
return 0;
}
Running readelf on a "hello world" binary produces the following output:
$ readelf -S helloworld
There are 30 section headers, starting at offset 0x1170:
Section Headers:
[Nr] Name Type Address Offset
Size EntSize Flags Link Info Align
[ 0] NULL 0000000000000000 00000000
0000000000000000 0000000000000000 0 0 0
[ 1] .interp PROGBITS 0000000000400238 00000238
000000000000001c 0000000000000000 A 0 0 1
[ 2] .note.ABI-tag NOTE 0000000000400254 00000254
0000000000000020 0000000000000000 A 0 0 4
[ 3] .note.gnu.build-i NOTE 0000000000400274 00000274
0000000000000024 0000000000000000 A 0 0 4
[ 4] .gnu.hash GNU_HASH 0000000000400298 00000298
000000000000001c 0000000000000000 A 5 0 8
[ 5] .dynsym DYNSYM 00000000004002b8 000002b8
0000000000000060 0000000000000018 A 6 1 8
[ 6] .dynstr STRTAB 0000000000400318 00000318
000000000000003d 0000000000000000 A 0 0 1
[ 7] .gnu.version VERSYM 0000000000400356 00000356
0000000000000008 0000000000000002 A 5 0 2
[ 8] .gnu.version_r VERNEED 0000000000400360 00000360
0000000000000020 0000000000000000 A 6 1 8
[ 9] .rela.dyn RELA 0000000000400380 00000380
0000000000000018 0000000000000018 A 5 0 8
[10] .rela.plt RELA 0000000000400398 00000398
0000000000000048 0000000000000018 A 5 12 8
[11] .init PROGBITS 00000000004003e0 000003e0
000000000000001a 0000000000000000 AX 0 0 4
[12] .plt PROGBITS 0000000000400400 00000400
0000000000000040 0000000000000010 AX 0 0 16
[13] .text PROGBITS 0000000000400440 00000440
0000000000000182 0000000000000000 AX 0 0 16
[14] .fini PROGBITS 00000000004005c4 000005c4
0000000000000009 0000000000000000 AX 0 0 4
[15] .rodata PROGBITS 00000000004005d0 000005d0
0000000000000013 0000000000000000 A 0 0 4
[16] .eh_frame_hdr PROGBITS 00000000004005e4 000005e4
0000000000000034 0000000000000000 A 0 0 4
[17] .eh_frame PROGBITS 0000000000400618 00000618
00000000000000f4 0000000000000000 A 0 0 8
[18] .init_array INIT_ARRAY 0000000000600e10 00000e10
0000000000000008 0000000000000000 WA 0 0 8
[19] .fini_array FINI_ARRAY 0000000000600e18 00000e18
0000000000000008 0000000000000000 WA 0 0 8
[20] .jcr PROGBITS 0000000000600e20 00000e20
0000000000000008 0000000000000000 WA 0 0 8
[21] .dynamic DYNAMIC 0000000000600e28 00000e28
00000000000001d0 0000000000000010 WA 6 0 8
[22] .got PROGBITS 0000000000600ff8 00000ff8
0000000000000008 0000000000000008 WA 0 0 8
[23] .got.plt PROGBITS 0000000000601000 00001000
0000000000000030 0000000000000008 WA 0 0 8
[24] .data PROGBITS 0000000000601030 00001030
0000000000000010 0000000000000000 WA 0 0 8
[25] .bss NOBITS 0000000000601040 00001040
0000000000000008 0000000000000000 WA 0 0 1
[26] .comment PROGBITS 0000000000000000 00001040
0000000000000024 0000000000000001 MS 0 0 1
[27] .shstrtab STRTAB 0000000000000000 00001064
0000000000000108 0000000000000000 0 0 1
[28] .symtab SYMTAB 0000000000000000 000018f0
0000000000000618 0000000000000018 29 45 8
[29] .strtab STRTAB 0000000000000000 00001f08
000000000000023c 0000000000000000 0 0 1
Key to Flags:
W (write), A (alloc), X (execute), M (merge), S (strings), l (large)
I (info), L (link order), G (group), T (TLS), E (exclude), x (unknown)
O (extra OS processing required) o (OS specific), p (processor specific)
Examining the following entry:
[13] .text PROGBITS 0000000000400440 00000440
0000000000000182 0000000000000000 AX 0 0 16
in order to retrieve the .text section entirely from the binary is it sufficient to read 0x182 bytes from the address 0x400440 + 0x440 from the file? where 0x182 is the section size and 0x400440 is the address and 0x440 is the offset? Also, what is the role of the alignment(0x16) here?

To extract .text section you need to copy 0x182 (Size) bytes starting from 0x440 (Offset) address in your binary file.
Ignore 0x400440 (Address) value, it has nothing to do with file addresses, it's address in RAM memory where your .text section will be copied by loader. From ELF format specification:
sh_addr: If the section will appear in the memory image of a process, this member gives the
address at which the section’s first byte should reside. Otherwise, the member contains 0.
Align value is actually decimal, not hexadecimal. So it's 16, not 0x16. Alignment means that section address must be multiple of 16 (bytes).
You can verify all this, exploring the binary by yourself. First, observe disassemble of your binary:
$ objdump -D your-file | less
Find where .text starts and then look at .text section data. Now just make a dumb hexdump operation:
$ hexdump -C your-file | less
Now find the Offset address and look at bytes starting from this address. You will find out they are the same bytes as from disassembled .text section.
Conclusion: you need to use Offset value (from readelf output) when working with your file, not Address value.

Related

How does objcopy compute what sections for the elf file to insert into the output file?

let's say I run arm-none-eabi-objcopy firmwared.elf -O ihex firmware.hex
Assume the binary was generated with the following linker script:
ENTRY(Reset_Handler)
MEMORY
{
FLASH (RX) : ORIGIN = 0x08020000, LENGTH = 896K
SRAM (RWX) : ORIGIN = 0x20000000, LENGTH = 512K
BKPSRAM (RW) : ORIGIN = 0x40024000, LENGTH = 4K
}
_estack = 0x20080000;
SECTIONS
{
.isr_vector :
{
. = ALIGN(4);
_isr_vector = .;
KEEP(*(.isr_vector))
. = ALIGN(4);
} > FLASH
.firmware_header_vector :
{
. = ALIGN(4);
KEEP(*(.firmware_header_vector))
. = ALIGN(4);
} > FLASH
.text :
{
. = ALIGN(4);
_stext = .;
*(.Reset_Handler)
*(.text)
*(.text*)
*(.rodata)
*(.rodata*)
*(.glue_7)
*(.glue_7t)
KEEP(*(.init))
KEEP(*(.fini))
. = ALIGN(4);
_etext = .;
} > FLASH
.ARM.extab :
{
. = ALIGN(4);
*(.ARM.extab)
*(.gnu.linkonce.armextab.*)
. = ALIGN(4);
} > FLASH
.exidx :
{
. = ALIGN(4);
PROVIDE(__exidx_start = .);
*(.ARM.exidx*)
. = ALIGN(4);
PROVIDE(__exidx_end = .);
} > FLASH
.preinit_array :
{
PROVIDE(__preinit_array_start = .);
KEEP(*(.preinit_array*))
PROVIDE(__preinit_array_end = .);
} > FLASH
.init_array :
{
PROVIDE(__init_array_start = .);
KEEP(*(SORT(.init_array.*)))
KEEP(*(.init_array*))
PROVIDE(__init_array_end = .);
} > FLASH
.fini_array :
{
PROVIDE(__fini_array_start = .);
KEEP(*(.fini_array*))
KEEP(*(SORT(.fini_array.*)))
PROVIDE(__fini_array_end = .);
} > FLASH
_sidata = .;
.data_x : AT(_sidata) /* LMA address is _sidata (in FLASH) */
{
. = ALIGN(4);
_sdata = .; /* data section VMA address */
*(.data*)
. = ALIGN(4);
_edata = .;
} > SRAM
.firmware_header (_sidata + SIZEOF(.data_x)):
{
. = ALIGN(4);
KEEP(*(.firmware_header))
. = ALIGN(4);
} > FLASH
.eth (NOLOAD) :
{
. = ALIGN(4);
KEEP(*(.RxDecripSection))
KEEP(*(.TxDescripSection))
KEEP(*(.RxarraySection))
KEEP(*(.TxarraySection))
. = ALIGN(4);
} > SRAM
.bss :
{
. = ALIGN(4);
_sbss = .;
PROVIDE(__bss_start__ = _sbss);
*(.bss)
*(.bss*)
*(COMMON)
. = ALIGN(4);
_ebss = .;
PROVIDE(__bss_end__ = _ebss);
} > SRAM
PROVIDE(end = .);
.heap (NOLOAD) :
{
. = ALIGN(4);
PROVIDE(__heap_start__ = .);
KEEP(*(.heap))
. = ALIGN(4);
PROVIDE(__heap_end__ = .);
} > SRAM
.reserved_for_stack (NOLOAD) :
{
. = ALIGN(4);
PROVIDE(__reserved_for_stack_start__ = .);
KEEP(*(.reserved_for_stack))
. = ALIGN(4);
PROVIDE(__reserved_for_stack_end__ = .);
} > SRAM
.battery_backed_sram (NOLOAD) :
{
. = ALIGN(4);
KEEP(*(.battery_backed_sram))
. = ALIGN(4);
} > BKPSRAM
/DISCARD/ :
{
*(.ARM.attributes)
}
}
This results in a readelf output of:
Section Headers:
[Nr] Name Type Addr Off Size ES Flg Lk Inf Al
[ 0] NULL 00000000 000000 000000 00 0 0 0
[ 1] .isr_vector PROGBITS 08020000 010000 0001f8 00 WA 0 0 4
[ 2] .firmware_header_ PROGBITS 080201f8 0101f8 000004 00 WA 0 0 4
[ 3] .text PROGBITS 08020200 010200 021b44 00 AX 0 0 64
[ 4] .ARM.extab PROGBITS 08041d44 042728 000000 00 W 0 0 1
[ 5] .exidx ARM_EXIDX 08041d44 031d44 000008 00 AL 3 0 4
[ 6] .init_array INIT_ARRAY 08041d4c 031d4c 000008 04 WA 0 0 4
[ 7] .fini_array FINI_ARRAY 08041d54 031d54 000004 04 WA 0 0 4
[ 8] .data_x PROGBITS 20000000 040000 0009c8 00 WA 0 0 8
[ 9] .firmware_header PROGBITS 08042720 042720 000008 00 WA 0 0 4
[10] .eth NOBITS 200009c8 0509c8 0030a0 00 WA 0 0 4
[11] .bss NOBITS 20003a68 0509c8 045da4 00 WA 0 0 4
[12] .heap PROGBITS 2004980c 042728 000000 00 W 0 0 1
[13] .reserved_for_sta PROGBITS 2004980c 042728 000000 00 W 0 0 1
[14] .battery_backed_s NOBITS 40024000 044000 00000c 00 WA 0 0 4
[15] .comment PROGBITS 00000000 042728 000075 01 MS 0 0 1
[16] .debug_frame PROGBITS 00000000 0427a0 00144c 00 0 0 4
[17] .stab PROGBITS 00000000 043bec 000084 0c 18 0 4
[18] .stabstr STRTAB 00000000 043c70 000117 00 0 0 1
[19] .symtab SYMTAB 00000000 043d88 009b00 10 20 1787 4
[20] .strtab STRTAB 00000000 04d888 0042bb 00 0 0 1
[21] .shstrtab STRTAB 00000000 051b43 0000e6 00 0 0 1
Key to Flags:
W (write), A (alloc), X (execute), M (merge), S (strings), I (info),
L (link order), O (extra OS processing required), G (group), T (TLS),
C (compressed), x (unknown), o (OS specific), E (exclude),
y (purecode), p (processor specific)
There are no section groups in this file.
Program Headers:
Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align
LOAD 0x010000 0x08020000 0x08020000 0x21d58 0x21d58 RWE 0x10000
LOAD 0x040000 0x20000000 0x08041d58 0x009c8 0x009c8 RW 0x10000
LOAD 0x042720 0x08042720 0x08042720 0x00008 0x00008 RW 0x10000
LOAD 0x0509c8 0x200009c8 0x08042720 0x00000 0x48e44 RW 0x10000
LOAD 0x044000 0x40024000 0x40024000 0x00000 0x0000c RW 0x10000
Section to Segment mapping:
Segment Sections...
00 .isr_vector .firmware_header_vector .text .exidx .init_array .fini_array
01 .data_x
02 .firmware_header
03 .eth .bss
04 .battery_backed_sram
How does objcopy know not to insert sections such as .bss in to the output image? I know that it computes this on the fly, and I'm assuming this mechanism is driven through the section to segment mapping, but I cannot find any explanation as to how it actually performs the segment to section mapping. The elf file stores no information about which segments are flash, and yet somehow objcopy knows that it should not copy .bss into the output file. How?
The 'A' flag in the Flg column of the readelf output, which ELF calls SHF_ALLOC, indicates a section that 'occupies memory during process execution'.
SHF_ALLOC : The section occupies memory during process execution. Some
control sections do not reside in the memory image of an object file;
this attribute is off for those sections.
http://refspecs.linuxbase.org/elf/elf.pdf
Typically this applies to both program and data memory, and in a 'normal' OS environment the OS loads the SHF_ALLOC sections to the indicated addresses (ignoring or using for other purposes the non-SHF_ALLOC sections as appropriate) and everything's ready to go.
In an embedded environment with ROM program memory, only SHF_ALLOC sections mapping to addresses in ROM need to be output.
(This question illustrates a case in which going by address alone without taking SHF_ALLOC into account is not enough.)
However:
Depending on how the linker produced the ELF file, initialised data sections may or may not require additional handling.
Ideally the linker produces two sections related to initialised data: one in RAM (the runtime data area), and one in ROM (the initialisation contents for the RAM data). Startup code references the addresses of these sections and copies the init data to the RAM area. In this case simply outputting SHF_ALLOC sections in the ROM address range will automatically produce correct results.
If the linker instead produces a single section as if for a normal OS, then when generating the ROM image the data section needs to be identified and emitted at an address in ROM (and the startup code needs to be able to find that address).
Most decent toolchains take the former approach if configured correctly, but I've certainly worked with the latter as well.
If you want an algorithm that mimics what objcopy -O binary does you need to look at the section headers. Unlike as is often suggested, just looking at segment headers is not enough.
Quote from the objcopy man page:
objcopy can be used to generate a raw binary file by using an output target of binary (e.g., use -O binary). When objcopy generates a raw binary file, it will essentially produce a memory dump of the contents of the input object file. All symbols and relocation information will be discarded. The memory dump will start at the load address of the lowest section copied into the output file.
I think the last sentence says very deliberately 'section', as this really is what happens: objcopy goes through all the sections and discards those that do not go into a raw binary file. The remaining sections are written to the output file, sorted by ascending address. Space between sections in the output file is filled with zeros.
For an own project where I need to convert ELF files to raw binaries and where I do not want to depend on objcopy I came up with the following:
If a section is of type SHT_NULL or SHT_NOBITS, then the section is not included. This follows from the ELF specification (https://refspecs.linuxbase.org/elf/elf.pdf):
SHT_NULL: "This value marks the section header as inactive; it does not have an associated section. Other members of the section header have undefined values."
SHT_NOBITS: "A section of this type occupies no space in the file but otherwise resembles SHT_PROGBITS. Although this section contains no bytes, the sh_offset member contains the conceptual file offset."
If the sh_addr field of a section header is 0, then the section is not included. Quote from the ELF specification: "If the section will appear in the memory image of a process, this member
gives the address at which the section's first byte should reside. Otherwise,
the member contains 0."
If the sh_size field of a section header is 0, then the section is not included. Quote from the ELF specification: "This member gives the section's size in bytes. Unless the section type is SHT_NOBITS, the section occupies sh_size bytes in the file. A section of type SHT_NOBITS may have a non-zero size, but it occupies no space
in the file
If the sh_flags field of a section header doesn't have the SHF_ALLOCflag set, then the section is not included. Quote from the ELF specification: "The section occupies memory during process execution. Some control sections do not reside in the memory image of an object file; this attribute is off for those sections."
Step 3 is somewhat redundant, but for me it nicely filters out sections whose size is zero, so that I do not have to deal with these during later stages.
This works for me, although I haven't thrown many ELF files at it yet. Also note that this is guesswork to some extent. I thought about reading the objcopy source code, but that leads very quickly into the bowels of libbfd.
I have no idea if this is the real way to do this, but this is how I ended up solving this:
# For each program header, get the sections contained by it
# For each section, calculate the LMA the section will reside at
# Do NOT load a section if...
# Section type is SHT_NULL or NOBITS
# Section size = 0
# The LMA is outside of the isr_vector -> header region
Note that in my case, I had a section capping off the end of the image. This made it very obvious as to exactly where the image ended.

Advantage of different data section in c [duplicate]

When checking the disassembly of the object file through the readelf, I see the data and the bss segments contain the same offset address.
The data section will contain the initialized global and static variables. BSS will contain un-initialized global and static variables.
1 #include<stdio.h>
2
3 static void display(int i, int* ptr);
4
5 int main(){
6 int x = 5;
7 int* xptr = &x;
8 printf("\n In main() program! \n");
9 printf("\n x address : 0x%x x value : %d \n",(unsigned int)&x,x);
10 printf("\n xptr points to : 0x%x xptr value : %d \n",(unsigned int)xptr,*xptr);
11 display(x,xptr);
12 return 0;
13 }
14
15 void display(int y,int* yptr){
16 char var[7] = "ABCDEF";
17 printf("\n In display() function \n");
18 printf("\n y value : %d y address : 0x%x \n",y,(unsigned int)&y);
19 printf("\n yptr points to : 0x%x yptr value : %d \n",(unsigned int)yptr,*yptr);
20 }
output:
SSS:~$ size a.out
text data bss dec hex filename
1311 260 8 1579 62b a.out
Here in the above program I don't have any un-initialized data but the BSS has occupied 8 bytes. Why does it occupy 8 bytes?
Also when I disassemble the object file,
EDITED :
[ 3] .data PROGBITS 00000000 000110 000000 00 WA 0 0 4
[ 4] .bss NOBITS 00000000 000110 000000 00 WA 0 0 4
[ 5] .rodata PROGBITS 00000000 000110 0000cf 00 A 0 0 4
data, rodata and bss has the same offset address. Does it mean the rodata, data and bss refer to the same address?
Do Data section, rodata section and the bss section contain the data values in the same address, if so how to distinguish the data section, bss section and rodata section?
The .bss section is guaranteed to be all zeros when the program is loaded into memory. So any global data that is uninitialized, or initialized to zero is placed in the .bss section. For example:
static int g_myGlobal = 0; // <--- in .bss section
The nice part about this is, the .bss section data doesn't have to be included in the ELF file on disk (ie. there isn't a whole region of zeros in the file for the .bss section). Instead, the loader knows from the section headers how much to allocate for the .bss section, and simply zero it out before handing control over to your program.
Notice the readelf output:
[ 3] .data PROGBITS 00000000 000110 000000 00 WA 0 0 4
[ 4] .bss NOBITS 00000000 000110 000000 00 WA 0 0 4
.data is marked as PROGBITS. That means there are "bits" of program data in the ELF file that the loader needs to read out into memory for you. .bss on the other hand is marked NOBITS, meaning there's nothing in the file that needs to be read into memory as part of the load.
Example:
// bss.c
static int g_myGlobal = 0;
int main(int argc, char** argv)
{
return 0;
}
Compile it with $ gcc -m32 -Xlinker -Map=bss.map -o bss bss.c
Look at the section headers with $ readelf -S bss
Section Headers:
[Nr] Name Type Addr Off Size ES Flg Lk Inf Al
[ 0] NULL 00000000 000000 000000 00 0 0 0
:
[13] .text PROGBITS 080482d0 0002d0 000174 00 AX 0 0 16
:
[24] .data PROGBITS 0804964c 00064c 000004 00 WA 0 0 4
[25] .bss NOBITS 08049650 000650 000008 00 WA 0 0 4
:
Now we look for our variable in the symbol table: $ readelf -s bss | grep g_myGlobal
37: 08049654 4 OBJECT LOCAL DEFAULT 25 g_myGlobal
Note that g_myGlobal is shown to be a part of section 25. If we look back in the section headers, we see that 25 is .bss.
To answer your real question:
Here in the above program I dont have any un-intialised data but the BSS has occupied 8 bytes. Why does it occupy 8 bytes ?
Continuing with my example, we look for any symbol in section 25:
$ readelf -s bss | grep 25
9: 0804825c 0 SECTION LOCAL DEFAULT 9
25: 08049650 0 SECTION LOCAL DEFAULT 25
32: 08049650 1 OBJECT LOCAL DEFAULT 25 completed.5745
37: 08049654 4 OBJECT LOCAL DEFAULT 25 g_myGlobal
The third column is the size. We see our expected 4-byte g_myGlobal, and this 1-byte completed.5745. This is probably a function-static variable from somewhere in the C runtime initialization - remember, a lot of "stuff" happens before main() is ever called.
4+1=5 bytes. However, if we look back at the .bss section header, we see the last column Al is 4. That is the section alignment, meaning this section, when loaded, will always be a multiple of 4 bytes. The next multiple up from 5 is 8, and that's why the .bss section is 8 bytes.
Additionally We can look at the map file generated by the linker to see what object files got placed where in the final output.
.bss 0x0000000008049650 0x8
*(.dynbss)
.dynbss 0x0000000000000000 0x0 /usr/lib/gcc/x86_64-redhat-linux/4.7.2/../../../../lib/crt1.o
*(.bss .bss.* .gnu.linkonce.b.*)
.bss 0x0000000008049650 0x0 /usr/lib/gcc/x86_64-redhat-linux/4.7.2/../../../../lib/crt1.o
.bss 0x0000000008049650 0x0 /usr/lib/gcc/x86_64-redhat-linux/4.7.2/../../../../lib/crti.o
.bss 0x0000000008049650 0x1 /usr/lib/gcc/x86_64-redhat-linux/4.7.2/32/crtbegin.o
.bss 0x0000000008049654 0x4 /tmp/ccKF6q1g.o
.bss 0x0000000008049658 0x0 /usr/lib/libc_nonshared.a(elf-init.oS)
.bss 0x0000000008049658 0x0 /usr/lib/gcc/x86_64-redhat-linux/4.7.2/32/crtend.o
.bss 0x0000000008049658 0x0 /usr/lib/gcc/x86_64-redhat-linux/4.7.2/../../../../lib/crtn.o
Again, the third column is the size.
We see 4 bytes of .bss came from /tmp/ccKF6q1g.o. In this trivial example, we know that is the temporary object file from the compilation of our bss.c file. The other 1 byte came from crtbegin.o, which is part of the C runtime.
Finally, since we know that this 1 byte mystery bss variable is from crtbegin.o, and it's named completed.xxxx, it's real name is completed and it's probably a static inside some function. Looking at crtstuff.c we find the culprit: a static _Bool completed inside of __do_global_dtors_aux().
By definition, the bss segment takes some place in memory (when the program starts) but don't need any disk space. You need to define some variable to get it filled, so try
int bigvar_in_bss[16300];
int var_in_data[5] = {1,2,3,4,5};
Your simple program might not have any data in .bss, and shared libraries (like libc.so) may have "their own .bss"
File offsets and memory addresses are not easily related.
Read more about the ELF specification, also use /proc/ (eg cat /proc/self/maps would display the address space of the cat process running that command).
Read also proc(5)

How to get the first address of initialized data segment

my program is working on linux using gcc. Through the manual page, I find edata, which represent the first address past the end of the initialized data segment.
But I want know the first address of initialized data segment
How can I get it?
I have tried treating etext as the first address of initialized data segment. Then I got a segment fault when I increase the address and access the variable stored in it. I think some address space between etext and edata was not mapped into virtual memory. Is that right?
That depends on your linker scripts. For example on some platforms you have the symbol __bss_start at the beginning of BSS. It's a symbol without any data associated with it, you can get a pointer to it by extern declaring a variable with that name (only for the sake of taking the address of that variable). For example:
#include <stdio.h>
extern char __bss_start;
int main()
{
printf("%p\n", &__bss_start);
return 0;
}
You find this by looking in the linker script, for example in /usr/lib/ldscripts/elf_x64_64.x:
.data :
{
*(.data .data.* .gnu.linkonce.d.*)
SORT(CONSTRUCTORS)
}
.data1 : { *(.data1) }
_edata = .; PROVIDE (edata = .);
__bss_start = .; /* <<<<< this is what you're looking for /*
.bss :
{
*(.dynbss)
*(.bss .bss.* .gnu.linkonce.b.*)
*(COMMON)
/* Align here to ensure that the .bss section occupies space up to
_end. Align after .bss to ensure correct alignment even if the
.bss section disappears because there are no input sections.
FIXME: Why do we need it? When there is no .bss section, we don't
pad the .data section. */
. = ALIGN(. != 0 ? 64 / 8 : 1);
}
You can also see the edata you mentioned, but as edata is not reserved for the implementation (the PROVIDE means only to create this symbol if it otherwise isn't used) you should probably use _edata instead.
If you want the address to the start of the data section you could modify the linker script:
__data_start = . ;
.data :
{
*(.data .data.* .gnu.linkonce.d.*)
SORT(CONSTRUCTORS)
}
.data1 : { *(.data1) }
_edata = .; PROVIDE (edata = .);
__bss_start = .; /* <<<<< this is what you're looking for /*
.bss :
{
*(.dynbss)
*(.bss .bss.* .gnu.linkonce.b.*)
*(COMMON)
/* Align here to ensure that the .bss section occupies space up to
_end. Align after .bss to ensure correct alignment even if the
.bss section disappears because there are no input sections.
FIXME: Why do we need it? When there is no .bss section, we don't
pad the .data section. */
. = ALIGN(. != 0 ? 64 / 8 : 1);
}
You probably want to make a copy of the linker script (look for the right one in /usr/lib/ldscripts, they are different depending on what kind of output you're targeting) and supply it when you compile:
gcc -o execfile source.c -Wl,-T ldscript
Another option if you don't want to modify the linker script could be to use the __executable_start and parse the ELF headers (hoping that the executable is sufficiently linearly mapped)
As for _etext, it is the end of the text section (you can read that in the linker script as well, but I didn't include it in the excerpt), but the text section is followed by rodata, trying to write there is probably going to segfault.
You can use the linux tool size (binutils package in Debian/Ubuntu).
Example
size -A /usr/bin/gcc
results in
/usr/bin/gcc :
section size addr
.interp 28 4194928
.note.ABI-tag 32 4194956
.note.gnu.build-id 36 4194988
.gnu.hash 240 4195024
.dynsym 4008 4195264
.dynstr 2093 4199272
.gnu.version 334 4201366
.gnu.version_r 160 4201704
.rela.dyn 720 4201864
.rela.plt 3240 4202584
.init 14 4205824
.plt 2176 4205840
.text 384124 4208016
.fini 9 4592140
.rodata 303556 4592160
.eh_frame_hdr 8540 4895716
.eh_frame 50388 4904256
.gcc_except_table 264 4954644
.tbss 16 7052632
.init_array 16 7052632
.fini_array 8 7052648
.jcr 8 7052656
.data.rel.ro 3992 7052672
.dynamic 480 7056664
.got 216 7057144
.got.plt 1104 7057384
.data 2520 7058496
.bss 80976 7061024
.gnu_debuglink 12 0
Total 849310

Interesting binary dump of executable file

For some reason I made simple program in C to output binary representation of given input:
int main()
{
char c;
while(read(0,&c,1) > 0)
{
unsigned char cmp = 128;
while(cmp)
{
if(c & cmp)
write(1,"1",1);
else
write(1,"0",1);
cmp >>= 1;
}
}
return 0;
}
After compilation:
$ gcc bindump.c -o bindump
I made simple test to check if program is able to print binary:
$ cat bindump | ./bindump | fold -b100 | nl
Output is following: http://pastebin.com/u7SasKDJ
I suspected the output to look like random series of ones and zeroes. However, output partially seems to be quite more interesting. For example take a look at the output between line 171 and 357. I wonder why there are lots of zeros in compare to other sections of executable ?
My architecture is:
$ lscpu
Architecture: i686
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 4
On-line CPU(s) list: 0-3
Thread(s) per core: 2
Core(s) per socket: 2
Socket(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 28
Stepping: 10
CPU MHz: 1000.000
BogoMIPS: 3325.21
Virtualization: VT-x
L1d cache: 24K
L1i cache: 32K
L2 cache: 512K
When you compile a program into an executable on Linux (and a number of other unix systems), it is written in the ELF format. The ELF format has a number of sections, which you can examine with readelf or objdump:
readelf -a bindump | less
For example, section .text contains CPU instructions, .data global variables, .bss uninitialized global variables (it is actually empty in the ELF file itself, but is created in the main memory when the program is executed), .plt and .got which are jump tables, debugging information, etc.
Btw. it is much more convenient to examine the binary content of files with hexdump:
hexdump -C bindata | less
There you can see that starting with offset 0x850 (approx. line 171 in your dump) there is a lot of zeros, and you can also see the ASCII representation on the right.
Let us look at which sections correspond to the block of your interest between 0x850 and 0x1160 (the field Off – offset in the file is important here):
> readelf -a bindata
...
Section Headers:
[Nr] Name Type Addr Off Size ES Flg Lk Inf Al
...
[28] .shstrtab STRTAB 00000000 00074c 000106 00 0 0 1
[29] .symtab SYMTAB 00000000 000d2c 000440 10 30 45 4
...
You can examine the content of an individual section with -x:
> readelf -x .symtab bindump | less
0x00000000 00000000 00000000 00000000 00000000 ................
0x00000010 00000000 34810408 00000000 03000100 ....4...........
0x00000020 00000000 48810408 00000000 03000200 ....H...........
0x00000030 00000000 68810408 00000000 03000300 ....h...........
0x00000040 00000000 8c810408 00000000 03000400 ................
0x00000050 00000000 b8810408 00000000 03000500 ................
0x00000060 00000000 d8810408 00000000 03000600 ................
You would see that there are many zeros. The section is composed of 18-byte values (= one line in the -x output) defining symbols. From readelf -a you can see that it has 68 entries, and first 27 of them (excl. the very first one) are of type SECTION:
Symbol table '.symtab' contains 68 entries:
Num: Value Size Type Bind Vis Ndx Name
0: 00000000 0 NOTYPE LOCAL DEFAULT UND
1: 08048134 0 SECTION LOCAL DEFAULT 1
2: 08048148 0 SECTION LOCAL DEFAULT 2
3: 08048168 0 SECTION LOCAL DEFAULT 3
4: 0804818c 0 SECTION LOCAL DEFAULT 4
...
According to the specification (page 1-18), each entry has the following format:
typedef struct {
Elf32_Word st_name;
Elf32_Addr st_value;
Elf32_Word st_size;
unsigned char st_info;
unsigned char st_other;
Elf32_Half st_shndx;
} Elf32_Sym;
Without going into too much detail here, I think what matters here is that st_name and st_size are both zeros for these SECTION entries. Both are 32-bit numbers, which means lots of zeros in this particular section.
This is not really a programming question, but however...
A binary normally consists of different sections: code, data, debugging info, etc. Since these sections contents differ by type, I would pretty much expect them to look different.
I.e. the symbol table consists of address offsets in your binary. If I read your lspci correctly, you are on a 32-bit system. That means Each offset has four bytes, and given the size of your program, in most cases two of those bytes will be zero. And there are more effects like this.
You didn't strip your program, that means there's still lots of information (symbol table etc.) present in the binary. Try stripping the binary and have a look at it again.

Difference between data section and the bss section in C

When checking the disassembly of the object file through the readelf, I see the data and the bss segments contain the same offset address.
The data section will contain the initialized global and static variables. BSS will contain un-initialized global and static variables.
1 #include<stdio.h>
2
3 static void display(int i, int* ptr);
4
5 int main(){
6 int x = 5;
7 int* xptr = &x;
8 printf("\n In main() program! \n");
9 printf("\n x address : 0x%x x value : %d \n",(unsigned int)&x,x);
10 printf("\n xptr points to : 0x%x xptr value : %d \n",(unsigned int)xptr,*xptr);
11 display(x,xptr);
12 return 0;
13 }
14
15 void display(int y,int* yptr){
16 char var[7] = "ABCDEF";
17 printf("\n In display() function \n");
18 printf("\n y value : %d y address : 0x%x \n",y,(unsigned int)&y);
19 printf("\n yptr points to : 0x%x yptr value : %d \n",(unsigned int)yptr,*yptr);
20 }
output:
SSS:~$ size a.out
text data bss dec hex filename
1311 260 8 1579 62b a.out
Here in the above program I don't have any un-initialized data but the BSS has occupied 8 bytes. Why does it occupy 8 bytes?
Also when I disassemble the object file,
EDITED :
[ 3] .data PROGBITS 00000000 000110 000000 00 WA 0 0 4
[ 4] .bss NOBITS 00000000 000110 000000 00 WA 0 0 4
[ 5] .rodata PROGBITS 00000000 000110 0000cf 00 A 0 0 4
data, rodata and bss has the same offset address. Does it mean the rodata, data and bss refer to the same address?
Do Data section, rodata section and the bss section contain the data values in the same address, if so how to distinguish the data section, bss section and rodata section?
The .bss section is guaranteed to be all zeros when the program is loaded into memory. So any global data that is uninitialized, or initialized to zero is placed in the .bss section. For example:
static int g_myGlobal = 0; // <--- in .bss section
The nice part about this is, the .bss section data doesn't have to be included in the ELF file on disk (ie. there isn't a whole region of zeros in the file for the .bss section). Instead, the loader knows from the section headers how much to allocate for the .bss section, and simply zero it out before handing control over to your program.
Notice the readelf output:
[ 3] .data PROGBITS 00000000 000110 000000 00 WA 0 0 4
[ 4] .bss NOBITS 00000000 000110 000000 00 WA 0 0 4
.data is marked as PROGBITS. That means there are "bits" of program data in the ELF file that the loader needs to read out into memory for you. .bss on the other hand is marked NOBITS, meaning there's nothing in the file that needs to be read into memory as part of the load.
Example:
// bss.c
static int g_myGlobal = 0;
int main(int argc, char** argv)
{
return 0;
}
Compile it with $ gcc -m32 -Xlinker -Map=bss.map -o bss bss.c
Look at the section headers with $ readelf -S bss
Section Headers:
[Nr] Name Type Addr Off Size ES Flg Lk Inf Al
[ 0] NULL 00000000 000000 000000 00 0 0 0
:
[13] .text PROGBITS 080482d0 0002d0 000174 00 AX 0 0 16
:
[24] .data PROGBITS 0804964c 00064c 000004 00 WA 0 0 4
[25] .bss NOBITS 08049650 000650 000008 00 WA 0 0 4
:
Now we look for our variable in the symbol table: $ readelf -s bss | grep g_myGlobal
37: 08049654 4 OBJECT LOCAL DEFAULT 25 g_myGlobal
Note that g_myGlobal is shown to be a part of section 25. If we look back in the section headers, we see that 25 is .bss.
To answer your real question:
Here in the above program I dont have any un-intialised data but the BSS has occupied 8 bytes. Why does it occupy 8 bytes ?
Continuing with my example, we look for any symbol in section 25:
$ readelf -s bss | grep 25
9: 0804825c 0 SECTION LOCAL DEFAULT 9
25: 08049650 0 SECTION LOCAL DEFAULT 25
32: 08049650 1 OBJECT LOCAL DEFAULT 25 completed.5745
37: 08049654 4 OBJECT LOCAL DEFAULT 25 g_myGlobal
The third column is the size. We see our expected 4-byte g_myGlobal, and this 1-byte completed.5745. This is probably a function-static variable from somewhere in the C runtime initialization - remember, a lot of "stuff" happens before main() is ever called.
4+1=5 bytes. However, if we look back at the .bss section header, we see the last column Al is 4. That is the section alignment, meaning this section, when loaded, will always be a multiple of 4 bytes. The next multiple up from 5 is 8, and that's why the .bss section is 8 bytes.
Additionally We can look at the map file generated by the linker to see what object files got placed where in the final output.
.bss 0x0000000008049650 0x8
*(.dynbss)
.dynbss 0x0000000000000000 0x0 /usr/lib/gcc/x86_64-redhat-linux/4.7.2/../../../../lib/crt1.o
*(.bss .bss.* .gnu.linkonce.b.*)
.bss 0x0000000008049650 0x0 /usr/lib/gcc/x86_64-redhat-linux/4.7.2/../../../../lib/crt1.o
.bss 0x0000000008049650 0x0 /usr/lib/gcc/x86_64-redhat-linux/4.7.2/../../../../lib/crti.o
.bss 0x0000000008049650 0x1 /usr/lib/gcc/x86_64-redhat-linux/4.7.2/32/crtbegin.o
.bss 0x0000000008049654 0x4 /tmp/ccKF6q1g.o
.bss 0x0000000008049658 0x0 /usr/lib/libc_nonshared.a(elf-init.oS)
.bss 0x0000000008049658 0x0 /usr/lib/gcc/x86_64-redhat-linux/4.7.2/32/crtend.o
.bss 0x0000000008049658 0x0 /usr/lib/gcc/x86_64-redhat-linux/4.7.2/../../../../lib/crtn.o
Again, the third column is the size.
We see 4 bytes of .bss came from /tmp/ccKF6q1g.o. In this trivial example, we know that is the temporary object file from the compilation of our bss.c file. The other 1 byte came from crtbegin.o, which is part of the C runtime.
Finally, since we know that this 1 byte mystery bss variable is from crtbegin.o, and it's named completed.xxxx, it's real name is completed and it's probably a static inside some function. Looking at crtstuff.c we find the culprit: a static _Bool completed inside of __do_global_dtors_aux().
By definition, the bss segment takes some place in memory (when the program starts) but don't need any disk space. You need to define some variable to get it filled, so try
int bigvar_in_bss[16300];
int var_in_data[5] = {1,2,3,4,5};
Your simple program might not have any data in .bss, and shared libraries (like libc.so) may have "their own .bss"
File offsets and memory addresses are not easily related.
Read more about the ELF specification, also use /proc/ (eg cat /proc/self/maps would display the address space of the cat process running that command).
Read also proc(5)

Resources