diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 000000000000..a777844a958c --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,29 @@ +name: Docs +on: + push: + branches: + - master + paths: + - docs/* + pull_request: + paths: + - docs/* +jobs: + pages: + runs-on: ubuntu-22.04 + permissions: + pages: write + id-token: write + steps: + - name: git checkout + uses: actions/checkout@v4 + - name: Install dependencies + run: pip install sphinx-design sphinxawesome-theme rstfmt + - name: Check formatting + run: rstfmt --check -w 100 docs/source + - name: Publish + if: github.event_name == 'push' + uses: sphinx-notes/pages@v3 + with: + checkout: false + documentation_path: docs/source diff --git a/docs/input-filter.md b/docs-old/input-filter.md similarity index 100% rename from docs/input-filter.md rename to docs-old/input-filter.md diff --git a/docs/mailinglist-rules.md b/docs-old/mailinglist-rules.md similarity index 100% rename from docs/mailinglist-rules.md rename to docs-old/mailinglist-rules.md diff --git a/docs/output-api.md b/docs-old/output-api.md similarity index 100% rename from docs/output-api.md rename to docs-old/output-api.md diff --git a/docs/parameter-parsing-api.md b/docs-old/parameter-parsing-api.md similarity index 100% rename from docs/parameter-parsing-api.md rename to docs-old/parameter-parsing-api.md diff --git a/docs/release-process.md b/docs-old/release-process.md similarity index 100% rename from docs/release-process.md rename to docs-old/release-process.md diff --git a/docs/self-contained-extensions.md b/docs-old/self-contained-extensions.md similarity index 100% rename from docs/self-contained-extensions.md rename to docs-old/self-contained-extensions.md diff --git a/docs/streams.md b/docs-old/streams.md similarity index 100% rename from docs/streams.md rename to docs-old/streams.md diff --git a/docs/unix-build-system.md b/docs-old/unix-build-system.md similarity index 100% rename from 
docs/unix-build-system.md rename to docs-old/unix-build-system.md diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 000000000000..567609b1234a --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +build/ diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 000000000000..515705d98bcd --- /dev/null +++ b/docs/README.md @@ -0,0 +1,20 @@ +# php-src docs + +This is the home of the php-src internal documentation. It is in very early stages, but is intended +to become the primary place where new information about php-src is documented. Over time, it is +expected to replace various mediums like: + +* https://www.phpinternalsbook.com/ +* https://wiki.php.net/internals +* Blogs from contributors + +## How to build + +`python` 3 and `pip` are required. + +```bash +pip install sphinx sphinx-design sphinxawesome-theme +make html +``` + +That's it! You can view the documentation under `./build/html/index.html` in your browser. diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 000000000000..747ffb7b3033 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/_static/.gitkeep b/docs/source/_static/.gitkeep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/docs/source/_templates/.gitkeep b/docs/source/_templates/.gitkeep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 000000000000..2eb75d509cc0 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,59 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +from dataclasses import asdict +from sphinxawesome_theme import ThemeOptions +from sphinxawesome_theme.postprocess import Icons +from sphinx.highlighting import lexers +from pygments.lexers.web import PhpLexer + +lexers['php'] = PhpLexer(startinline=True) +lexers['php-annotations'] = PhpLexer(startinline=True) + +project = 'php-src docs' +author = 'The PHP Group' +extensions = [ + 'sphinx_design', + 'sphinxawesome_theme.highlighting', +] +templates_path = ['_templates'] +html_theme = 'sphinxawesome_theme' +html_static_path = ['_static'] +html_title = project +html_permalinks_icon = Icons.permalinks_icon +theme_options = ThemeOptions( + show_prev_next=True, + extra_header_link_icons={ + 'repository on GitHub': { + 'link': 'https://github.com/php/php-src', + 'icon': ( + '' + '' + ), + }, + }, +) +html_theme_options = asdict(theme_options) +pygments_style = 'sphinx' diff --git a/docs/source/core/data-structures/index.rst b/docs/source/core/data-structures/index.rst new file mode 100644 index 000000000000..327e025b92c7 --- /dev/null +++ 
b/docs/source/core/data-structures/index.rst @@ -0,0 +1,12 @@ +################# + Data structures +################# + +.. toctree:: + :hidden: + + zval + reference-counting + zend_string + +This section provides an overview of the core data structures used in php-src. diff --git a/docs/source/core/data-structures/reference-counting.rst b/docs/source/core/data-structures/reference-counting.rst new file mode 100644 index 000000000000..77432f249967 --- /dev/null +++ b/docs/source/core/data-structures/reference-counting.rst @@ -0,0 +1,228 @@ +#################### + Reference counting +#################### + +In languages like C, when you need memory for storing data for an indefinite period of time or in a +large amount, you call ``malloc`` and ``free`` to acquire and release blocks of memory of some size. +This sounds simple on the surface but turns out to be quite tricky, mainly because the data may not +be freed for as long as it is used anywhere in the program. Sometimes this makes it unclear who is +responsible for freeing the memory, and when to do so. Failure to handle this correctly may result +in a use-after-free, double-free, or memory leak. + +In PHP you usually do not need to think about memory management. The engine takes care of allocating +and freeing memory for you by tracking which values are no longer needed. It does this by assigning +a reference count to each allocated value, often abbreviated as refcount or RC. Whenever a reference +to a value is passed somewhere else, its reference count is increased to indicate the value is now +used by another party. When the party no longer needs the value, it is responsible for decreasing +the reference count. Once the reference count reaches zero, we know the value is no longer needed +anywhere, and that it may be freed. + +.. 
code:: php + + $a = new stdClass; // RC 1 + $b = $a; // RC 2 + unset($a); // RC 1 + unset($b); // RC 0, free + +Reference counting is needed for types that store auxiliary data, which are the following: + +- Strings +- Arrays +- Objects +- References +- Resources + +These are either reference types (objects, references and resources) or they are large types that +don't fit in a single ``zend_value`` directly (strings, arrays). Simpler types either don't store a +value at all (``null``, ``false``, ``true``) or their value is small enough to fit directly in +``zend_value`` (``int``, ``float``). + +All of the reference counted types share a common initial struct sequence. + +.. code:: c + + typedef struct _zend_refcounted_h { + uint32_t refcount; /* reference counter 32-bit */ + union { + uint32_t type_info; + } u; + } zend_refcounted_h; + + struct _zend_string { + zend_refcounted_h gc; + // ... + }; + + struct _zend_array { + zend_refcounted_h gc; + // ... + }; + +The ``zend_refcounted_h`` struct is simple. It contains the reference count, and a ``type_info`` +field that repeats some of the type information that is also stored in the ``zval``, for situations +where we're not dealing with a ``zval`` directly. It also stores some additional fields, described +under `GC flags`_. + +******** + Macros +******** + +As with ``zval``, ``zend_refcounted_h`` members should not be accessed directly. Instead, you should +use the provided macros. There are macros that work with reference counted types directly, prefixed +with ``GC_``, or macros that work on ``zval`` values, usually prefixed with ``Z_``. Unfortunately, +naming is not always consistent. + +.. list-table:: ``zval`` macros + :header-rows: 1 + + - - Macro + - Non-RC [#non-rc]_ + - Description + + - - ``Z_REFCOUNT[_P]`` + - No + - Returns the reference count. + + - - ``Z_ADDREF[_P]`` + - No + - Increases the reference count. + + - - ``Z_TRY_ADDREF[_P]`` + - Yes + - Increases the reference count. 
May be called on any ``zval``. + + - - ``zval_ptr_dtor`` + - Yes + - Decreases the reference count and frees the value if the reference count reaches zero. + +.. [#non-rc] + + Whether the macro works with non-reference counted types. If it does, the operation is usually a + no-op. If it does not, using the macro on these values is undefined behavior. + +.. list-table:: ``zend_refcounted_h`` macros + :header-rows: 1 + + - - Macro + - Immutable [#immutable]_ + - Description + + - - ``GC_REFCOUNT[_P]`` + - Yes + - Returns the reference count. + + - - ``GC_ADDREF[_P]`` + - No + - Increases the reference count. + + - - ``GC_TRY_ADDREF[_P]`` + - Yes + - Increases the reference count. + + - - ``GC_DTOR[_P]`` + - Yes + - Decreases the reference count and frees the value if the reference count reaches zero. + +.. [#immutable] + + Whether the macro works with immutable types, described under `Immutable reference counted types`_. + +************ + Separation +************ + +PHP has value and reference types. Reference types are types that are shared through a reference, a +"pointer" to the value, rather than the value itself. Modifying such a value in one place changes it +for all of its observers. For example, writing to a property changes the property in every place the +object is referenced. Value types, on the other hand, are copied when passed to another party. +Modifying the original value does not affect the copy, and vice versa. + +In PHP, arrays and strings are value types. Since they are also reference counted types, this +requires some special care when modifying values. In particular, we need to make sure that modifying +the value is not observable from other places. Modifying a value with RC 1 is unproblematic, since +we are the value's sole owner. However, if the value has a reference count of >1, we need to create a +fresh copy before modifying it. This process is called separation or CoW (copy on write). + +.. 
code:: php + + $a = [1, 2, 3]; // RC 1 + $b = $a; // RC 2 + $b[] = 4; // Separation, $a RC 1, $b RC 1 + var_dump($a); // [1, 2, 3] + var_dump($b); // [1, 2, 3, 4] + +*********************************** + Immutable reference counted types +*********************************** + +Sometimes, even a reference counted type is not reference counted. When PHP runs in a multi-process +or multi-threaded environment with opcache enabled, it shares some common values between processes +or threads to reduce memory consumption. As you may know, sharing memory between processes or +threads can be tricky and requires special care when modifying values. In particular, modification +usually requires exclusive access to the memory so that the other processes or threads wait until +the value is done being updated. In this case, this synchronization is avoided by making the value +immutable and never modifying the reference count. Such values will receive the ``GC_IMMUTABLE`` +flag in their ``gc->u.type_info`` field. + +Some macros like ``GC_TRY_ADDREF`` will guard against immutable values. You should not use immutable +values on some macros, like ``GC_ADDREF``. This will result in undefined behavior, because the macro +will not check whether the value is immutable before performing the reference count modifications. +You may execute PHP with the ``-d opcache.protect_memory=1`` flag to mark the shared memory as +read-only and trigger a hardware exception if the code accidentally attempts to modify it. + +***************** + Cycle collector +***************** + +Sometimes, reference counting is not enough. Consider the following example: + +.. code:: php + + $a = new stdClass; + $b = new stdClass; + $a->b = $b; + $b->a = $a; + unset($a); + unset($b); + +When this code finishes, the reference count of both instances of ``stdClass`` will still be 1, as +they reference each other. This is called a reference cycle. 
+ +PHP implements a cycle collector that detects such cycles and frees values that are only reachable +through their own references. The cycle collector will record values that may be involved in a +cycle, and run when this buffer becomes full. It is also possible to invoke it explicitly by calling +the ``gc_collect_cycles()`` function. The cycle collector's design is described in the `Cycle +collector `_ chapter. + +********** + GC flags +********** + +.. code:: c + + /* zval_gc_flags(zval.value->gc.u.type_info) (common flags) */ + #define GC_NOT_COLLECTABLE (1<<4) + #define GC_PROTECTED (1<<5) /* used for recursion detection */ + #define GC_IMMUTABLE (1<<6) /* can't be changed in place */ + #define GC_PERSISTENT (1<<7) /* allocated using malloc */ + #define GC_PERSISTENT_LOCAL (1<<8) /* persistent, but thread-local */ + +The ``GC_NOT_COLLECTABLE`` flag indicates that the value cannot be involved in a reference cycle. +This allows for a fast way to detect values that don't need to be added to the cycle collector +buffer. Only arrays and objects may actually be involved in reference cycles. + +The ``GC_PROTECTED`` flag is used to protect against recursion in various internal functions. For +example, ``var_dump`` recursively prints the contents of values, and marks visited values with the +``GC_PROTECTED`` flag. If the value is recursive, it prevents the same value from being visited +again. + +``GC_IMMUTABLE`` has been discussed in `Immutable reference counted types`_. + +The ``GC_PERSISTENT`` flag indicates that the value was allocated using ``malloc``, instead of PHP's +own allocator. Usually, such values are alive for the entire lifetime of the process, instead of +being freed at the end of the request. See the `Zend allocator `_ chapter for more +information. + +The ``GC_PERSISTENT_LOCAL`` flag indicates that a ``GC_PERSISTENT`` value is only accessible in one +thread, and is thus still safe to modify. 
This flag is only used in debug builds to satisfy an +``assert``. diff --git a/docs/source/core/data-structures/zend_string.rst b/docs/source/core/data-structures/zend_string.rst new file mode 100644 index 000000000000..2b07611e3425 --- /dev/null +++ b/docs/source/core/data-structures/zend_string.rst @@ -0,0 +1,196 @@ +############# + zend_string +############# + +In C, strings are represented as sequential lists of characters, ``char*`` or ``char[]``. The end of +the string is usually indicated by the special NUL character, ``'\0'``. This comes with a few +significant downsides: + +- Calculating the length of the string is expensive, as it requires walking the entire string to + look for the terminating NUL character. +- The string cannot contain the NUL character itself. +- It is easy to run into buffer overflows if the NUL byte is accidentally missing. + +php-src uses the ``zend_string`` struct as an abstraction over ``char*``, which explicitly stores +the string's length, along with some other fields. It looks as follows: + +.. code:: c + + struct _zend_string { + zend_refcounted_h gc; + zend_ulong h; /* hash value */ + size_t len; + char val[1]; + }; + +The ``gc`` field is used for :doc:`./reference-counting`. The ``h`` field contains a hash value, +which is used for `hash table `__ lookups. The ``len`` field stores the length of the string +in bytes, and the ``val`` field contains the actual string data. + +You may wonder why the ``val`` field is declared as ``char val[1]``. This is called the `struct +hack`_ in C. It is used to create structs with a flexible size, namely by allowing the last element +to be expanded arbitrarily. In this case, the size of ``zend_string`` depends on the string's length, +which is determined at runtime (see ``_ZSTR_STRUCT_SIZE``). When allocating the string, we append +enough bytes to the allocation to hold the string's content. + +.. 
_struct hack: https://www.geeksforgeeks.org/struct-hack/ + +Here's a basic example of how to use ``zend_string``: + +.. code:: c + + // Allocate the string. + zend_string *string = ZSTR_INIT_LITERAL("Hello world!", /* persistent */ false); + // Write it to the output buffer. + zend_write(ZSTR_VAL(string), ZSTR_LEN(string)); + // Decrease the reference count and free it if necessary. + zend_string_release(string); + +``ZSTR_INIT_LITERAL`` creates a ``zend_string`` from a string literal. It is just a wrapper around +``zend_string_init(char *string, size_t length, bool persistent)`` that provides the length of the +string at compile time. The ``persistent`` parameter indicates whether the string is allocated using +``malloc`` (``persistent == true``) or ``emalloc``, `PHP's custom allocator `__ (``persistent +== false``) that is emptied after each request. + +When you're done using the string, you must call ``zend_string_release``, or the memory will leak. +``zend_string_release`` will automatically call ``free`` or ``efree``, depending on how the +string was allocated. After releasing the string, you must not access any of its fields anymore, as +it may have been freed if you were its last user. + +***** + API +***** + +The string API is defined in ``Zend/zend_string.h``. It provides a number of functions for creating +new strings. + +.. list-table:: ``zend_string`` creation + :header-rows: 1 + + - - Function/Macro [#persistent]_ + - Description + + - - ``ZSTR_INIT_LITERAL(s, p)`` + - Creates a new string from a string literal. + + - - ``zend_string_init(s, l, p)`` + - Creates a new string from a character buffer. + + - - ``zend_string_alloc(l, p)`` + - Creates a new string of a given length without initializing its content. + + - - ``zend_string_concat2(s1, l1, s2, l2)`` + - Creates a non-persistent string by concatenating two character buffers. + + - - ``zend_string_concat3(...)`` + - Same as ``zend_string_concat2``, but for three character buffers. 
+ + - - ``ZSTR_EMPTY_ALLOC()`` + - Gets an immutable, empty string. This does not allocate memory. + + - - ``ZSTR_CHAR(char)`` + - Gets an immutable, single-character string. This does not allocate memory. + + - - ``ZSTR_KNOWN(ZEND_STR_const)`` + + - Gets an immutable, predefined string. Used for strings common within PHP itself, e.g. + ``"class"``. See ``ZEND_KNOWN_STRINGS`` in ``Zend/zend_string.h``. This does not allocate + memory. + +.. [#persistent] + + ``s`` = ``zend_string``, ``l`` = ``length``, ``p`` = ``persistent``. + +As per php-src fashion, you are not supposed to access the ``zend_string`` fields directly. Instead, +use the following macros. There are macros for both ``zend_string`` and ``zvals`` known to contain +strings. + +.. list-table:: Accessor macros + :header-rows: 1 + + - - ``zend_string`` + - ``zval`` + - Description + + - - ``ZSTR_LEN`` + - ``Z_STRLEN[_P]`` + - Returns the length of the string in bytes. + + - - ``ZSTR_VAL`` + - ``Z_STRVAL[_P]`` + - Returns the string data as a ``char*``. + + - - ``ZSTR_HASH`` + - ``Z_STRHASH[_P]`` + - Computes the string hash if it hasn't already been computed, and returns it. + + - - ``ZSTR_H`` + - \- + - Returns the string hash. This macro assumes that the hash has already been computed. + +.. list-table:: Reference counting macros + :header-rows: 1 + + - - Macro + - Description + + - - ``zend_string_copy(s)`` + - Increases the reference count and returns the same string. The reference count is not + increased if the string is interned. + + - - ``zend_string_release(s)`` + - Decreases the reference count and frees the string if it goes to 0. + + - - ``zend_string_dup(s, p)`` + - Creates a true copy of the string in a new allocation, except if the string is interned. + + - - ``zend_string_separate(s)`` + - Duplicates the string if the reference count is greater than 1. See + :doc:`./reference-counting` for details. + + - - ``zend_string_realloc(s, l, p)`` + + - Changes the size of the string. 
If the string has a reference count greater than 1 or if + the string is interned, a new string is created. You must always use the return value of + this function, as the original string may have been moved to a new location in memory. + +There are various functions to compare strings. The ``zend_string_equals`` function compares two +strings in full, while ``zend_string_starts_with`` checks whether the first argument starts with the +second. There are variations for ``_ci`` and ``_literal``, i.e. case-insensitive comparison and +literal strings, respectively. We won't go over all variations here, as they are straightforward to +use. + +****************** + Interned strings +****************** + +Programs use some strings many times. For example, if your program declares a class called +``MyClass``, it would be wasteful to allocate a new string ``"MyClass"`` every time it is referenced +within your program. Instead, when repeated strings are expected, php-src uses a technique called +string interning. Essentially, this is just a simple `HashTable `__ where existing interned +strings are stored. When creating a new interned string, php-src first checks the interned string +buffer. If it finds it there, it can return a pointer to the existing string. If it doesn't, it +allocates a new string and adds it to the buffer. + +.. code:: c + + zend_string *str1 = zend_new_interned_string( + ZSTR_INIT_LITERAL("MyClass", /* persistent */ false)); + + // In some other place entirely. + zend_string *str2 = zend_new_interned_string( + ZSTR_INIT_LITERAL("MyClass", /* persistent */ false)); + + assert(ZSTR_IS_INTERNED(str1)); + assert(ZSTR_IS_INTERNED(str2)); + assert(str1 == str2); + +Interned strings are *not* reference counted, as they are expected to live for the entire request, +or longer. + +With opcache, this goes one step further by sharing strings across different processes. 
For example, +if you're using php-fpm with 8 workers, all workers will share the same interned strings buffer. It +gets a bit more complicated. During requests, no interned strings are actually created. Instead, +this is delayed until the script is persisted to shared memory. This means that +``zend_new_interned_string`` may not actually return an interned string if opcache is enabled. +Usually you don't have to worry about this. diff --git a/docs/source/core/data-structures/zval.rst b/docs/source/core/data-structures/zval.rst new file mode 100644 index 000000000000..3fba7e7d2373 --- /dev/null +++ b/docs/source/core/data-structures/zval.rst @@ -0,0 +1,225 @@ +###### + zval +###### + +PHP is a dynamic language. A variable can typically contain a value of any type, and the type of the +variable may even change during the execution of the program. Under the hood, this is implemented +through the ``zval`` struct. It is one of the most important data structures in php-src. It is +implemented as a "tagged union", meaning it stores what type of value it contains, and the value +itself. Let's look at the type first. + +************ + zval types +************ + +.. code:: c + + #define IS_UNDEF 0 /* A variable that was never written to. */ + #define IS_NULL 1 + #define IS_FALSE 2 + #define IS_TRUE 3 + #define IS_LONG 4 /* An integer value. */ + #define IS_DOUBLE 5 /* A floating point value. */ + #define IS_STRING 6 + #define IS_ARRAY 7 + #define IS_OBJECT 8 + #define IS_RESOURCE 9 + #define IS_REFERENCE 10 + +These simple integer constants determine what value is currently stored in a variable. If you are a +PHP developer, these types should sound fairly familiar. They are pretty much an exact reflection of +the types you may use in regular PHP code. One small oddity is that ``IS_FALSE`` and ``IS_TRUE`` are +implemented as separate types, instead of as a ``IS_BOOL`` type. + +Some of these types are self-contained, they don't store any auxiliary data. 
This includes +``IS_UNDEF``, ``IS_NULL``, ``IS_FALSE`` and ``IS_TRUE``. For the rest of the types, we are going to +require some additional memory to store the actual value of the variable. + +************ + zend_value +************ + +.. code:: c + + typedef union _zend_value { + zend_long lval; /* long value, i.e. int. */ + double dval; /* double value, i.e. float. */ + zend_refcounted *counted; + zend_string *str; + zend_array *arr; + zend_object *obj; + zend_resource *res; + zend_reference *ref; + // Less important for now. + zend_ast_ref *ast; + zval *zv; + void *ptr; + zend_class_entry *ce; + zend_function *func; + struct { + uint32_t w1; + uint32_t w2; + } ww; + } zend_value; + +A C union is a data type that may store any one of its members at a time, by being (at least) as big +as its biggest member. For example, ``zend_value`` may store the ``lval`` member, or the ``dval`` +member, but never both at the same time. However, it doesn't know which member is being stored. +Remembering this is our job, and that's exactly what the ``IS_*`` constants are for. + +The top members of ``zend_value`` mostly mirror the ``IS_*`` constants, with the exception of +``counted``. ``counted`` polymorphically refers to any `reference counted `__ value, including +strings, arrays, objects, resources and references. ``null`` and ``bool`` are missing from +``zend_value`` because their types are self-contained. + +The rest of the fields aren't important for now. + +****** + zval +****** + +Together, the value and the tag make up the ``zval``, along with some other fields. It may look +intimidating at first. We'll go over it step by step. + +.. 
code:: c + + typedef struct _zval_struct zval; + + struct _zval_struct { + zend_value value; + union { + uint32_t type_info; + struct { + ZEND_ENDIAN_LOHI_3( + uint8_t type, /* active type */ + uint8_t type_flags, + union { + uint16_t extra; /* not further specified */ + } u) + } v; + } u1; + union { + uint32_t next; /* hash collision chain */ + uint32_t cache_slot; /* cache slot (for RECV_INIT) */ + uint32_t opline_num; /* opline number (for FAST_CALL) */ + uint32_t lineno; /* line number (for ast nodes) */ + uint32_t num_args; /* arguments number for EX(This) */ + uint32_t fe_pos; /* foreach position */ + uint32_t fe_iter_idx; /* foreach iterator index */ + uint32_t guard; /* recursion and single property guard */ + uint32_t constant_flags; /* constant flags */ + uint32_t extra; /* not further specified */ + } u2; + }; + +``zval.value`` reserves space for the actual variable data, as discussed above. + +``zval.u1`` stores the variable type, the given ``IS_*`` constant, along with some other flags. Its +definition looks a bit complicated. You can think of the entire field as a 4-byte integer, split into +3 parts. ``v.type`` stores the actual variable type, ``v.type_flags`` is used for some `reference +counting `__ flags, and ``v.u.extra`` is pretty much unused. + +``zval.u2`` defines some more storage for various contexts that is often unoccupied. It's there +because the memory would otherwise be wasted due to padding, so we may as well make use of it. We'll +go over the relevant ones in their corresponding chapters. + +******** + Macros +******** + +The fields in ``zval`` should never be accessed directly. Instead, there are a plethora of macros to +access them, concealing some of the implementation details of the ``zval`` struct. For many macros, +there's a ``_P``-suffixed variant that performs the same operation on a pointer to the given +``zval``. + +.. 
list-table:: ``zval`` macros + :header-rows: 1 + + - - Macro + - Description + - - ``Z_TYPE[_P]`` + - Access the ``zval.u1.v.type`` part of the type flags, containing the ``IS_*`` type. + - - ``Z_LVAL[_P]`` + - Access the underlying ``int`` value. + - - ``Z_DVAL[_P]`` + - Access the underlying ``float`` value. + - - ``Z_STR[_P]`` + - Access the underlying ``zend_string`` pointer. + - - ``Z_STRVAL[_P]`` + - Access the string's raw ``char *`` pointer. + - - ``Z_STRLEN[_P]`` + - Access the string's length. + - - ``ZVAL_COPY_VALUE(t, s)`` + - Copy one ``zval`` to another, including type and value. + - - ``ZVAL_COPY(t, s)`` + - Same as ``ZVAL_COPY_VALUE``, but if the value is reference counted, increase the counter. + +.. + _todo: There are many more. + +****************** + Other zval types +****************** + +``zval``\ s are sometimes used internally with types that don't exist in userland. + +.. code:: c + + #define IS_CONSTANT_AST 11 + #define IS_INDIRECT 12 + #define IS_PTR 13 + #define IS_ALIAS_PTR 14 + #define _IS_ERROR 15 + +``IS_CONSTANT_AST`` is used to represent constant values (the right hand side of ``const``, +property/parameter initializers, etc.) before they are evaluated. The evaluation of a constant +expression is not always possible during compilation, because it may contain references to values +only available at runtime. Until that evaluation is possible, the constants contain the AST of the +expression rather than the concrete values. Check the `parser `__ chapter for more information +on ASTs. When this flag is set, the ``zval.value.ast`` union member is set accordingly. + +``IS_INDIRECT`` indicates that the ``zval.value.zv`` member is populated. This field stores a +pointer to some other ``zval``. This type is mainly used in two situations, namely for intermediate +values between ``FETCH`` and ``ASSIGN`` instructions, and for the sharing of variables in the symbol +table. + +.. + _todo: There are many more. 
+ +``IS_PTR`` is used for pointers to arbitrary data. Most commonly, this type is used internally for +``HashTable``, as ``HashTable`` may only store ``zval`` values. For example, ``EG(class_table)`` +represents the class table, which is a hash map of class names to the corresponding +``zend_class_entry``, representing the class. The same goes for functions and many other data types. +``IS_ALIAS_PTR`` is used for class aliases registered via ``class_alias``. Essentially, it just +allows differentiating between members in the class table that are aliases and those that are actual classes. +Otherwise, it is essentially the same as ``IS_PTR``. Arbitrary data is accessed through +``zval.value.ptr``, and cast to the correct type depending on context. If ``ptr`` stores a class +or function, the ``zval.value.ce`` or ``zval.value.func`` fields may be used, respectively. + +``_IS_ERROR`` is used as an error value for some `object handlers `__. It is described in more +detail in its own chapter. + +.. code:: c + + /* Fake types used only for type hinting. + * These are allowed to overlap with the types below. */ + #define IS_CALLABLE 12 + #define IS_ITERABLE 13 + #define IS_VOID 14 + #define IS_STATIC 15 + #define IS_MIXED 16 + #define IS_NEVER 17 + + /* used for casts */ + #define _IS_BOOL 18 + #define _IS_NUMBER 19 + +These flags are never actually stored in ``zval.u1``. They are used for type hinting and in the +`object handler `__ API. + +This only leaves the ``zval.value.ww`` field. In short, this field is used on 32-bit platforms when +copying data from one ``zval`` to another. Normally, ``zval.value.counted`` is copied as a generic +value, no matter what the actual underlying type is. ``zend_value`` always consists of 8 bytes due +to the ``double`` field. Pointers, however, consist only of 4. Because we would otherwise miss the +other 4 bytes, they are copied manually using ``z->value.ww.w2 = _w2;``. 
This happens in the +``ZVAL_COPY_VALUE_EX`` macro; you won't ever have to care about this. diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 000000000000..f80004227538 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,58 @@ +############## + php-src docs +############## + +.. toctree:: + :caption: Introduction + :hidden: + + introduction/high-level-overview + +.. toctree:: + :caption: Core + :hidden: + + core/data-structures/index + +Welcome to the php-src documentation! + +.. warning:: + + This documentation is a work in progress. + + At this point in time, there are other guides that provide a more complete picture of the PHP + project. Check the `CONTRIBUTING.md + `__ file for a + list of technical resources. + +php-src is the canonical implementation of the interpreter for the PHP programming language, as well +as various extensions that provide common functionality. This documentation is intended to help you +understand how the interpreter works, how you can build and test changes, and how you can create +extensions yourself. + +This documentation is not intended to be comprehensive, but is meant to explain core concepts that +are not easy to grasp by reading code alone. It describes best practices, and will frequently omit +APIs that are discouraged for general use. + +****************** + How to get help? +****************** + +Getting started with a new and complicated project like php-src can be overwhelming. While there's +no way around reading lots and lots of code, asking questions of somebody with experience can save a +lot of time. Luckily, many core developers are eager to help. Here are some ways you can get in +touch. + +- `Discord `__ (``#php-internals`` channel) +- `R11 on StackOverflow `__ + +*************** + Prerequisites +*************** + +The php-src interpreter is written in C, and so are most of the bundled extensions. 
While extensions +may also be written in C++, ext-intl is currently the only bundled extension to do so. It is +advisable that you have *some* knowledge of C before jumping into php-src. + +It is also advisable to get familiar with the semantics of PHP itself, so that you may better +differentiate between bugs and expected behavior, and model new language features. diff --git a/docs/source/introduction/high-level-overview.rst b/docs/source/introduction/high-level-overview.rst new file mode 100644 index 000000000000..1240bed4c0e6 --- /dev/null +++ b/docs/source/introduction/high-level-overview.rst @@ -0,0 +1,194 @@ +##################### + High-level overview +##################### + +PHP is an interpreted language. Interpreted languages differ from compiled ones in that they aren't +compiled into machine-readable code ahead of time. Instead, the source files are read, processed and +interpreted when the program is executed. This can be very convenient for developers for rapid +prototyping, as it skips a lengthy compilation phase. However, it also poses some unique challenges +to performance, which is one of the primary reasons interpreters can be complex. php-src borrows +many concepts from other compilers and interpreters. + +********** + Pipeline +********** + +The goal of the interpreter is to read the user's source files, and to simulate the user's intent. +This process can be split into distinct phases that are easier to understand and implement. + +- Tokenization - splitting whole source files into words, called tokens. +- Parsing - building a tree structure from tokens, called AST (abstract syntax tree). +- Compilation - traversing the AST and building a list of operations, called opcodes. +- Interpretation - reading and executing opcodes. + +php-src as a whole can be seen as a pipeline consisting of these stages, using the output of the +previous phase and producing some output for the next. + +.. 
code:: haskell + + source_code + |> tokenizer -- tokens + |> parser -- ast + |> compiler -- opcodes + |> interpreter + +Let's go into each phase in a bit more detail. + +************** + Tokenization +************** + +Tokenization, often called "lexing" or "scanning", is the process of taking an entire program file +and splitting it into a list of words and symbols. Tokens generally consist of a type, a simple +integer constant representing the token, and a lexeme, the literal string used in the source code. + +.. code:: php + + if ($cond) { + echo "Cond is true\n"; + } + +.. code:: text + + T_IF "if" + T_WHITESPACE " " + "(" + T_VARIABLE "$cond" + ")" + T_WHITESPACE " " + "{" + T_WHITESPACE "\n " + T_ECHO "echo" + T_WHITESPACE " " + T_CONSTANT_ENCAPSED_STRING '"Cond is true\n"' + ";" + T_WHITESPACE "\n" + "}" + +While tokenizers are not difficult to write by hand, PHP uses a tool called ``re2c`` to automate +this process. It takes a definition file and generates efficient C code to build these tokens from a +stream of characters. The definition for PHP lives in ``Zend/zend_language_scanner.l``. Check the +`re2c documentation`_ for details. + +.. _re2c documentation: https://re2c.org/ + +********* + Parsing +********* + +Parsing is the process of reading the tokens generated from the tokenizer and building a tree +structure from it. To humans, how source code elements are grouped seems obvious through whitespace +and the usage of symbols like ``()`` and ``{}``. However, computers cannot visually glance over the +code to determine these boundaries quickly. To make it easier and faster to work with, we build a +tree structure from the tokens to more closely reflect the source code the way humans see it. + +Here is a simplified example of what an AST from the tokens above might look like. + +.. 
code:: text + + ZEND_AST_IF { + ZEND_AST_IF_ELEM { + ZEND_AST_VAR { + ZEND_AST_ZVAL { "cond" }, + }, + ZEND_AST_STMT_LIST { + ZEND_AST_ECHO { + ZEND_AST_ZVAL { "Cond is true\n" }, + }, + }, + }, + } + +Each AST node has a type and may have children. They also store their original position in the +source code, and may define some arbitrary flags. These are omitted for brevity. + +Like with tokenization, we use a tool called ``Bison`` to generate the parser implementation from a +grammar specification. The grammar lives in the ``Zend/zend_language_parser.y`` file. Check the +`Bison documentation`_ for details. Luckily, the syntax is quite approachable. + +.. _bison documentation: https://www.gnu.org/software/bison/manual/ + +Parsing is described in more detail in its `dedicated chapter `__. + +************* + Compilation +************* + +Computers don't understand human language, or even programming languages. They only understand +machine code, which consists of sequences of simple, mostly atomic instructions that each do one thing. For +example, they may add two numbers, load some memory from RAM, jump to an instruction under a certain +condition, etc. It turns out that even the most complex expressions can be reduced to a number of +these simple instructions. + +PHP is a bit different, in that it does not execute machine code directly. Instead, instructions run +on a "virtual machine", often abbreviated to VM. This is just a fancy way of saying that there is no +physical machine you can buy that understands these instructions, but that this machine is +implemented in software. This is our interpreter. This also means that we are free to make up +instructions ourselves at will. Some of these instructions look very similar to something you'd find +in an actual CPU instruction set (e.g. adding two numbers), while others are much more high-level +(e.g. loading a property of an object by name). 
+ +With that little detour out of the way, the job of the compiler is to read the AST and translate it +into our virtual machine instructions, also called opcodes. The code responsible for this +transformation lives in ``Zend/zend_compile.c``. It essentially traverses the AST and generates a +number of instructions, before going to the next node. + +Here's what the surprisingly compact opcodes for the AST above might look like: + +.. code:: text + + 0000 JMPZ CV0($cond) 0002 + 0001 ECHO string("Cond is true\n") + 0002 RETURN int(1) + +**************** + Interpretation +**************** + +Finally, the opcodes are read and executed by the interpreter. PHP uses `three-address code`_ for +instructions. This essentially means that each instruction may have a result value, and at most two +operands. Most modern CPUs also use this format. Both result and operands in PHP are :doc:`zvals +<../core/data-structures/zval>`. + +.. _three-address code: https://en.wikipedia.org/wiki/Three-address_code + +How exactly each opcode behaves depends on its purpose. You can find a complete list of opcodes in +the generated ``Zend/zend_vm_opcodes.h`` file. The behavior of each instruction is defined in +``Zend/zend_vm_def.h``. + +Let's step through the opcodes from the example above: + +- We start at the top, i.e. ``JMPZ``. If its first operand contains a "falsy" value, it will jump + to the instruction encoded in its second operand. If it is truthy, it will simply fall through to + the next instruction. + +- The ``ECHO`` instruction prints its first operand. + +- The ``RETURN`` instruction terminates the current function. + +With these simple rules, we can see that the interpreter will ``echo`` only when ``$cond`` is +truthy, and skip over the ``echo`` otherwise. + +That's it! This is how PHP works, fundamentally. Of course, we skipped over a ton of details. The VM +is quite complex, and will be discussed separately in the `virtual machine `__ chapter. 
+ +********* + Opcache +********* + +As you may imagine, running this whole pipeline every time PHP serves a request is time-consuming. +Luckily, it is also not necessary. We can cache the opcodes in memory between requests, to skip over +all of the phases, except for the execution phase. This is precisely what the opcache extension +does. It lives in the ``ext/opcache`` directory. + +Opcache also performs some optimizations on the opcodes before caching them. As opcodes are +expected to be reused many times, it is profitable to spend some additional time simplifying them if +possible to improve performance during execution. The optimizer lives in ``Zend/Optimizer``. + +JIT +=== + +The opcache also implements a JIT compiler, which stands for just-in-time compiler. This compiler +takes the virtual PHP opcodes and turns them into actual machine instructions, with additional +information gained at runtime. JITs are very complex pieces of software, so this book will likely +barely scratch the surface of how they work. The JIT lives in ``ext/opcache/jit``.