数组实现方式：PHP5 VS PHP7

本篇文章带大家来比较一下php5和php7的数组实现方式，看看它们之间的差异！
从 php 5 到 php 7 ，php 通过对 hashtable 数据结构和实现方式的修改，使得数组在内存占用和性能上有了很大的提升。
⒈ 数据结构
// php 5 中 hashtable 的数据结构定义typedef struct bucket { ulong h; /*对于索引数组，存储 key 的原始值；对于关联数组，存储 key 的 hash 之后的值*/ uint nkeylength; /*关联数组时存储 key 的长度，索引数组此值为 0*/ void *pdata; /*指向数组 value 的地址*/ void *pdataptr; /*如果 value 为指针，则由 pdataptr 记录 vlaue，pdata 则指向 pdataptr*/ // php 5 中数组元素的顺序是固定的，无论什么时候遍历，数组元素总是与插入时的顺序一致 // php 5 中使用双向链表来保证数组元素的顺序，plistnext 和 plistlast 分别按照 // 元素插入顺序记录当前 bucket 的下一个和上一个 bucket struct bucket *plistnext; struct bucket *plistlast; // php 5 使用拉链法解决 hash 碰撞，pnext 和 plast 分别存储当前 bucket // 在冲突的双向链表中的下一个和上一个相邻的 bucket struct bucket *pnext; struct bucket *plast; const char *arkey; /*关联数组是存储 key 的原始值*/} bucket;typedef struct _hashtable { uint ntablesize; /*当前 ht 所分配的 bucket 的总数，2^n*/ uint ntablemask; /*ntablesize - 1，用于计算索引*/ uint nnumofelements; /*实际存储的元素的数量*/ ulong nnextfreeelement; /*下一个可以被使用的整数 key*/ bucket *pinternalpointer; /*数组遍历时，记录当前 bucket 的地址*/ bucket *plisthead; bucket *plisttail; bucket **arbuckets; /*记录 bucket 的 c 语言数组*/ dtor_func_t pdestructor; /*删除数组元素时内部调用的函数*/ zend_bool persistent; /*标识 ht 是否永久有效*/ unsigned char napplycount; /*ht 允许的最大递归深度*/ zend_bool bapplyprotection; /*是否启用递归保护*/#if zend_debug int inconsistent;#endif} hashtable;// php 7 中 hashtable 的数据结构// php 7 中个子版本以及阶段版本中对 hashtable 的数据结构的定义会有微小的差别，这里使用的是 php 7.4.0 中的定义 struct _zend_string { zend_refcounted_h gc; zend_ulong h; /*字符串 key 的 hash 值*/ size_t len; /*字符串 key 的长度*/ char val[1]; /*存储字符串的值，利用了 struct hack*/};typedef struct _bucket { zval val; /*内嵌 zval 结构，存储数组的 value 值*/ zend_ulong h; /* hash value (or numeric index) */ zend_string *key; /* string key or null for numerics */} bucket;typedef struct _zend_array hashtable;struct _zend_array { zend_refcounted_h gc; union { struct { zend_endian_lohi_4( zend_uchar flags, zend_uchar _unused, zend_uchar niteratorscount, zend_uchar _unused2) } v; uint32_t flags; } u; uint32_t ntablemask; /*作用与 php 5 中 hashtable 中 ntablemask 作用相同，但实现逻辑稍有变化*/ bucket *ardata; /*存储 bucket 相关的信息*/ uint32_t nnumused; /*ht 中已经使用的 bucket 的数量，在 nnumofelements 的基础上加上删除的 key*/ uint32_t nnumofelements; uint32_t ntablesize; uint32_t ninternalpointer; zend_long nnextfreeelement; dtor_func_t pdestructor;};
不考虑其他开销，单从 bucket 所占用的空间来看：在 php 5 中，考虑到内存对齐，一个 bucket 占用的空间为 72 字节；在 php 7 中，一个 zend_value 占 8 字节，一个 zval 占 16 字节，一个 bucket 占 32 字节。相比之下，php 7 中 bucket 的内存空间消耗比 php 5 低了一半以上。
具体 php 5 数组的内存消耗情况，之前的文章已有讲解，这里不再赘述
现在来谈谈 bucket 的存储：在 php 5 中，arbucket 是一个 c 语言数组，长度为 ntablesize，存储的是指向 bucket 的指针，发生 hash 碰撞的 bucket 以双向链表的方式连接。
在 php 7 中，bucket 按照数组元素写入的顺序依次存储，其索引值为 idx，该值存储在 *ardata 左侧的映射区域中。idx 在映射区域中的索引为 nindex，nindex 值为负数，由数组 key 的 hash 值与 ntablemask 进行或运算得到。
// ntablemask 为 -2 倍的 ntablesize 的无符号表示#define ht_size_to_mask(ntablesize) \ ((uint32_t)(-((ntablesize) + (ntablesize))))// 在通过 idx 查找 bucket 时，data 默认为 bucket 类型，加 idx 表示向右偏移 idx 个 bucket 位置# define ht_hash_to_bucket_ex(data, idx) \ ((data) + (idx))// 在通过 nindex 查找 idx 时，// (uint32_t*)(data) 首先将 data 转换成了 uint32_t* 类型的数组// 然后将 nindex 转换成有符号数（负数），然后以数组的方式查找 idx 的值#define ht_hash_ex(data, idx) \ ((uint32_t*)(data))[(int32_t)(idx)]nindex = h | ht->ntablemask;idx = ht_hash_ex(ardata, nindex);p = ht_hash_to_bucket_ex(ardata, idx);
这里需要指出，ntablemask 之所以设置为 ntablesize 的两倍，是这样在计算 nindex 时可以减小 hash 碰撞的概率。
⒉ 添加/修改元素
php 5 先来谈谈 php 5 中数组元素的添加和修改，由于 php 5 中数组元素的插入顺序以及 hash 碰撞都是通过双向链表的方式来维护，所以虽然实现起来有些复杂，但理解起来相对容易一些。
// hash 碰撞双向链表的维护#define connect_to_bucket_dllist(element, list_head) \ (element)->pnext = (list_head); \ (element)->plast = null; \ if ((element)->pnext) { \ (element)->pnext->plast = (element); \ }#define connect_to_global_dllist_ex(element, ht, last, next)\ (element)->plistlast = (last); \ (element)->plistnext = (next); \ if ((last) != null) { \ (last)->plistnext = (element); \ } else { \ (ht)->plisthead = (element); \ } \ if ((next) != null) { \ (next)->plistlast = (element); \ } else { \ (ht)->plisttail = (element); \ } \// 数组元素插入顺序双向链表的维护#define connect_to_global_dllist(element, ht) \ connect_to_global_dllist_ex(element, ht, (ht)->plisttail, (bucket *) null); \ if ((ht)->pinternalpointer == null) { \ (ht)->pinternalpointer = (element); \ }// 数组元素的更新#define update_data(ht, p, pdata, ndatasize) \ if (ndatasize == sizeof(void*)) { \ // 值为指针类型的元素的更新 \ if ((p)->pdata != &(p)->pdataptr) { \ pefree_rel((p)->pdata, (ht)->persistent); \ } \ // pdataptr 存储元素值的地址，pdata 存储 pdataptr 的地址 \ memcpy(&(p)->pdataptr, pdata, sizeof(void *)); \ (p)->pdata = &(p)->pdataptr; \ } else { \ // 如果数组元素为值类型，则存入 pdata，此时 pdataptr 为 null \ if ((p)->pdata == &(p)->pdataptr) { \ (p)->pdata = (void *) pemalloc_rel(ndatasize, (ht)->persistent); \ (p)->pdataptr=null; \ } else { \ (p)->pdata = (void *) perealloc_rel((p)->pdata, ndatasize, (ht)->persistent); \ /* (p)->pdataptr is already null so no need to initialize it */ \ } \ memcpy((p)->pdata, pdata, ndatasize); \ }// 数组元素的初始化#define init_data(ht, p, _pdata, ndatasize); \ if (ndatasize == sizeof(void*)) { \ // 指针类型元素的初始化 \ memcpy(&(p)->pdataptr, (_pdata), sizeof(void *)); \ (p)->pdata = &(p)->pdataptr; \ } else { \ // 值类型元素的初始化 \ (p)->pdata = (void *) pemalloc_rel(ndatasize, (ht)->persistent);\ memcpy((p)->pdata, (_pdata), ndatasize); \ (p)->pdataptr=null; \ }// hashtable 初始化校验，如果没有初始化，则初始化 hashtable#define check_init(ht) do { \ if (unexpected((ht)->ntablemask == 0)) { \ (ht)->arbuckets = (bucket **) pecalloc((ht)->ntablesize, sizeof(bucket *), (ht)->persistent); \ (ht)->ntablemask = (ht)->ntablesize - 1; \ } \} while (0)// 数组元素的新增或更新（精简掉了一些宏调用和代码片段)zend_api int _zend_hash_add_or_update(hashtable *ht, const char *arkey, uint nkeylength, void *pdata, uint ndatasize, void **pdest, int flag zend_file_line_dc){ ulong h; uint nindex; bucket *p; check_init(ht); h = zend_inline_hash_func(arkey, nkeylength); nindex = h & ht->ntablemask; p = ht->arbuckets[nindex]; while (p != null) { if (p->arkey == arkey || ((p->h == h) && (p->nkeylength == nkeylength) && !memcmp(p->arkey, arkey, nkeylength))) { // 数组元素更新逻辑 if (flag & hash_add) { return failure; } zend_assert(p->pdata != pdata); if (ht->pdestructor) { ht->pdestructor(p->pdata); } update_data(ht, p, pdata, ndatasize); if (pdest) { *pdest = p->pdata; } return success; } p = p->pnext; } // 数组元素新增逻辑 if (is_interned(arkey)) { p = (bucket *) pemalloc(sizeof(bucket), ht->persistent); p->arkey = arkey; } else { p = (bucket *) pemalloc(sizeof(bucket) + nkeylength, ht->persistent); p->arkey = (const char*)(p + 1); memcpy((char*)p->arkey, arkey, nkeylength); } p->nkeylength = nkeylength; init_data(ht, p, pdata, ndatasize); p->h = h; // hash 碰撞链表维护 connect_to_bucket_dllist(p, ht->arbuckets[nindex]); if (pdest) { *pdest = p->pdata; } // 数组元素写入顺序维护 connect_to_global_dllist(p, ht); ht->arbuckets[nindex] = p; ht->nnumofelements++; zend_hash_if_full_do_resize(ht); /* if the hash table is full, resize it */ return success;}
php 5 中的数组在新增或修改元素时，首先会根据给定的 key 计算得到相应的 hash 值，然后据此得到 arbuckets 的索引 nindex，最终得到链表中第一个 bucket（ hash 碰撞链表的表头），即p。
如果是更新数组中已有的项，那么会从 p 开始遍历 hash 碰撞链表，直到找到 arkey 与给定的 key 相同的 bucket，然后更新 pdata。
如果是向数组中新增项，首先会判断给定的 key 是否为 interned string 类型，如果是，那么只需要为 bucket 申请内存，然后将 p->arkey 指向给定的 key 的地址即可，否则在为新的 bucket 申请内存的同时还需要为给定的 key 申请内存，然后将 p->arkey 指向为 key 申请的内存的地址。之后会对新申请的 bucket 进行初始化，最后要做的两件事：维护 hash 碰撞链表和数组元素写入顺序链表。在维护 hash 碰撞的链表时，新增的 bucket 是放在链表头的位置；维护数组元素写入顺序的链表时，新增的 bucket 是放在链表的末尾，同时将 hashtable 的 plisttail 指向新增的 bucket。
关于 php 中的 interned string，之前在讲解 php 7 对字符串处理逻辑优化的时候已经说明，这里不再赘述
php 7 php 7 在 hashtable 的数据结构上做了比较大的改动，同时放弃了使用双向链表的方式来维护 hash 碰撞和数组元素的写入顺序，在内存管理以及性能上得到了提升，但理解起来却不如 php 5 中的实现方式直观。
#define z_next(zval) (zval).u2.next#define ht_hash_ex(data, idx) \ ((uint32_t*)(data))[(int32_t)(idx)]# define ht_idx_to_hash(idx) \ ((idx) * sizeof(bucket))// php 7 中数组添加/修改元素（精简了部分代码）static zend_always_inline zval *_zend_hash_add_or_update_i(hashtable *ht, zend_string *key, zval *pdata, uint32_t flag){ zend_ulong h; uint32_t nindex; uint32_t idx; bucket *p, *ardata; /*... ...*/ zend_hash_if_full_do_resize(ht); /* if the hash table is full, resize it */add_to_hash: idx = ht->nnumused++; ht->nnumofelements++; ardata = ht->ardata; p = ardata + idx; p->key = key; p->h = h = zstr_h(key); nindex = h | ht->ntablemask; z_next(p->val) = ht_hash_ex(ardata, nindex); ht_hash_ex(ardata, nindex) = ht_idx_to_hash(idx); zval_copy_value(&p->val, pdata); return &p->val;}
这里需要先说明一下 nnumused 和 nnumofelements 的区别：
按图中示例，此时 nnumused 的值应该为 5，但 nnumofelements 的值则应该为 3。在 php 7 中，数组元素按照写入顺序依次存储，而 nnumused 正好可以用来充当数组元素存储位置索引的功能。
另外就是 p = ardata + idx ，前面已经讲过 ardata 为 bucket 类型，这里 +idx 意为指针从 ardata 的位置开始向右偏移 idx 个 bucket 的位置。宏调用 ht_hash_ex 也是同样的道理。
最后就是 z_next(p->val)，php 7 中的 bucket 结构都内嵌了一个 zval，zval 中的联合体 u2 中有一项 next 用来记录hash 碰撞的信息。nindex 用来标识 idx 在映射表中的位置，在往 hashtable 中新增元素时，如果根据给定的 key 计算得到的 nindex 的位置已经有值（即发生了 hash 碰撞），那么此时需要将 nindex 所指向的位置的原值记录到新增的元素所对应的 bucket 下的 val.u2.next 中。宏调用 ht_idx_to_hash 的作用是根据 idx 计算得到 bucket 的以字节为单位的偏移量。
⒊ 删除元素
php 5 在 php 5 中，数组元素的删除过程中的主要工作是维护 hash 碰撞链表和数组元素写入顺序的链表。
// 删除 bucket 的代码（精简了部分代码片段）static zend_always_inline void i_zend_hash_bucket_delete(hashtable *ht, bucket *p){ if (p->plast) { p->plast->pnext = p->pnext; } else { ht->arbuckets[p->h & ht->ntablemask] = p->pnext; } if (p->pnext) { p->pnext->plast = p->plast; } if (p->plistlast != null) { p->plistlast->plistnext = p->plistnext; } else { /* deleting the head of the list */ ht->plisthead = p->plistnext; } if (p->plistnext != null) { p->plistnext->plistlast = p->plistlast; } else { /* deleting the tail of the list */ ht->plisttail = p->plistlast; } if (ht->pinternalpointer == p) { ht->pinternalpointer = p->plistnext; } ht->nnumofelements--; if (ht->pdestructor) { ht->pdestructor(p->pdata); } if (p->pdata != &p->pdataptr) { pefree(p->pdata, ht->persistent); } pefree(p, ht->persistent);}// 元素删除zend_api int zend_hash_del_key_or_index(hashtable *ht, const char *arkey, uint nkeylength, ulong h, int flag){ uint nindex; bucket *p; if (flag == hash_del_key) { h = zend_inline_hash_func(arkey, nkeylength); } nindex = h & ht->ntablemask; p = ht->arbuckets[nindex]; while (p != null) { if ((p->h == h) && (p->nkeylength == nkeylength) && ((p->nkeylength == 0) /* numeric index (short circuits the memcmp() check) */ || !memcmp(p->arkey, arkey, nkeylength))) { /* string index */ i_zend_hash_bucket_delete(ht, p); return success; } p = p->pnext; } return failure;}
php 5 中数组在删除元素时，仍然是先根据给定的 key 计算 hash，然后找到 arbucket 的 nindex，最终找到需要删除的 bucket 所在的 hash 碰撞的链表，通过遍历链表，找到最终需要删除的 bucket。
在实际删除 bucket 的过程中，主要做的就是维护两个链表：hash 碰撞链表和数组元素写入顺序链表。再就是释放内存。
php 7 由于 php 7 记录 hash 碰撞信息的方式发生了变化，所以在删除元素时处理 hash 碰撞链表的逻辑也会有所不同。另外，在删除元素时，还有可能会遇到空间回收的情况。
#define is_undef 0#define z_type_info(zval) (zval).u1.type_info#define z_type_info_p(zval_p) z_type_info(*(zval_p))#define zval_undef(z) do { \ z_type_info_p(z) = is_undef; \ } while (0) static zend_always_inline void _zend_hash_del_el_ex(hashtable *ht, uint32_t idx, bucket *p, bucket *prev){ // 从 hash 碰撞链表中删除指定的 bucket if (!(ht_flags(ht) & hash_flag_packed)) { if (prev) { z_next(prev->val) = z_next(p->val); } else { ht_hash(ht, p->h | ht->ntablemask) = z_next(p->val); } } idx = ht_hash_to_idx(idx); ht->nnumofelements--; if (ht->ninternalpointer == idx || unexpected(ht_has_iterators(ht))) { // 如果当前 hashtable 的内部指针指向了要删除的 bucket 或当前 hashtable 有遍历 // 操作，那么需要避开当前正在被删除的 bucket uint32_t new_idx; new_idx = idx; while (1) { new_idx++; if (new_idx >= ht->nnumused) { break; } else if (z_type(ht->ardata[new_idx].val) != is_undef) { break; } } if (ht->ninternalpointer == idx) { ht->ninternalpointer = new_idx; } zend_hash_iterators_update(ht, idx, new_idx); } if (ht->nnumused - 1 == idx) { //如果被删除的 bucket 在数组的末尾，则同时回收与 bucket 相邻的已经被删除的 bucket 的空间 do { ht->nnumused--; } while (ht->nnumused > 0 && (unexpected(z_type(ht->ardata[ht->nnumused-1].val) == is_undef))); ht->ninternalpointer = min(ht->ninternalpointer, ht->nnumused); } if (p->key) { // 删除 string 类型的索引 zend_string_release(p->key); } // 删除 bucket if (ht->pdestructor) { zval tmp; zval_copy_value(&tmp, &p->val); zval_undef(&p->val); ht->pdestructor(&tmp); } else { zval_undef(&p->val); }}static zend_always_inline void _zend_hash_del_el(hashtable *ht, uint32_t idx, bucket *p){ bucket *prev = null; if (!(ht_flags(ht) & hash_flag_packed)) { // 如果被删除的 bucket 存在 hash 碰撞的情况，那么需要找出其在 hash 碰撞链表中的位置 uint32_t nindex = p->h | ht->ntablemask; uint32_t i = ht_hash(ht, nindex); if (i != idx) { prev = ht_hash_to_bucket(ht, i); while (z_next(prev->val) != idx) { i = z_next(prev->val); prev = ht_hash_to_bucket(ht, i); } } } _zend_hash_del_el_ex(ht, idx, p, prev);}zend_api void zend_fastcall zend_hash_del_bucket(hashtable *ht, bucket *p){ is_consistent(ht); ht_assert_rc1(ht); _zend_hash_del_el(ht, ht_idx_to_hash(p - ht->ardata), p);}
php 7 中数组元素的删除，其最终目的是删除指定的 bucket。在删除 bucket 时还需要处理好 hash 碰撞链表维护的问题。由于 php 7 中 hash 碰撞只维护了一个单向链表（通过 bucket.val.u2.next 来维护），所以在删除 bucket 时还需要找出 hash 碰撞链表中的前一项 prev。最后，在删除 bucket 时如果当前的 hashtable 的内部指针（ninternalpointer）正好指向了要删除的 bucket 或存在遍历操作，那么需要改变内部指针的指向，同时在遍历时跳过要删除的 bucket。另外需要指出的是，并不是每一次删除 bucket 的操作都会回收相应的内存空间，通常删除 bucket 只是将其中 val 的类型标记为 is_undef，只有在扩容或要删除的 bucket 为最后一项并且相邻的 bucket 为 is_undef 时才会回收其内存空间。
⒋ 数组遍历
php 5 由于 php 5 中有专门用来记录数组元素写入顺序的双向链表，所以数组的遍历逻辑相对比较简单。
// 数组的正向遍历zend_api int zend_hash_move_forward_ex(hashtable *ht, hashposition *pos){ hashposition *current = pos ? pos : &ht->pinternalpointer; is_consistent(ht); if (*current) { *current = (*current)->plistnext; return success; } else return failure;}// 数组的反向遍历zend_api int zend_hash_move_backwards_ex(hashtable *ht, hashposition *pos){ hashposition *current = pos ? pos : &ht->pinternalpointer; is_consistent(ht); if (*current) { *current = (*current)->plistlast; return success; } else return failure;}
&emsp； php 5 中 hashtable 的数据结构中有三个字段：pinternalpointer 用来记录数组遍历过程中指针指向的当前 bucket 的地址；plisthead 用来记录保存数组元素写入顺序的双向链表的表头；plisttail 用来记录保存数组元素写入顺序的双向链表的表尾。数组的正向遍历从 plisthead 的位置开始，通过不断更新 pinternalpointer 来实现；反向遍历从 plisttail 开始，通过不断更新 pinternalpointer 来实现。
php 7 由于 php 7 中数组的元素是按照写入的顺序存储，所以遍历的逻辑相对简单，只是在遍历过程中需要跳过被标记为 is_undef 的项。
⒌ hash 碰撞
php 5 前面在谈论数组元素添加/修改的时候已有提及，每次在数组新增元素时，都会检查并处理 hash 碰撞，即 connect_to_bucket_dllist，代码如下
connect_to_bucket_dllist(p, ht->arbuckets[nindex]);#define connect_to_bucket_dllist(element, list_head) \ (element)->pnext = (list_head); \ (element)->plast = null; \ if ((element)->pnext) { \ (element)->pnext->plast = (element); \ }
在新增元素时，如果当前 arbuckets 的位置没有其他元素，那么只需要直接写入新增的 bucket 即可，否则新增的 bucket 会被写入 hash 碰撞双向链表的表头位置。
php 7 前面已经讲过，php 7 中的 hashtable 是通过 bucket 中的 val.u2.next 项来维护 hash 碰撞的单向链表的。所以，在往 hashtable 中添加新的元素时，最后需要先将 nindex 位置的值写入新增的 bucket 的 val.u2.next 中。而在删除 bucket 时，需要同时找出要删除的 bucket 所在的 hash 碰撞链表中的前一项，以便后续的 hash 碰撞链表的维护。
⒍ 扩容
php 5 在数组元素新增/修改的 api 中的最后有一行代码 zend_hash_if_full_do_resize(ht) 来判断当前 hashtable 是否需要扩容，如果需要则对其进行扩容。
// 判断当前 hashtable 是否需要扩容#define zend_hash_if_full_do_resize(ht) \ if ((ht)->nnumofelements > (ht)->ntablesize) { \ zend_hash_do_resize(ht); \ }// hashtable 扩容（精简部分代码）zend_api int zend_hash_do_resize(hashtable *ht){ bucket **t; if ((ht->ntablesize << 1) > 0) { /* let's double the table size */ t = (bucket **) perealloc(ht->arbuckets, (ht->ntablesize << 1) * sizeof(bucket *), ht->persistent); ht->arbuckets = t; ht->ntablesize = (ht->ntablesize << 1); ht->ntablemask = ht->ntablesize - 1; zend_hash_rehash(ht); }}// 扩容后对 hashtable 中的元素进行 rehash（精简部分代码）zend_api int zend_hash_rehash(hashtable *ht){ bucket *p; uint nindex; if (unexpected(ht->nnumofelements == 0)) { return success; } memset(ht->arbuckets, 0, ht->ntablesize * sizeof(bucket *)); for (p = ht->plisthead; p != null; p = p->plistnext) { nindex = p->h & ht->ntablemask; connect_to_bucket_dllist(p, ht->arbuckets[nindex]); ht->arbuckets[nindex] = p; } return success;}
首先，php 5 hashtable 扩容的前提条件：数组中元素的数量超过 hashtable 的 ntablesize 的值。之后，hashtable 的 ntablesize 会翻倍，然后重新为 arbuckets 分配内存空间并且更新 ntablemask 的值。最后，由于 ntablemask 发生变化，需要根据数组元素的索引重新计算 nindex，然后将之前的 bucket 关联到新分配的 arbuckets 中新的位置。
php 7 在 php 7 的新增/修改 hashtable 的 api 中也有判断是否需要扩容的代码 zend_hash_if_full_do_resize(ht)，当满足条件时则会执行扩容操作。
#define ht_size_to_mask(ntablesize) \ ((uint32_t)(-((ntablesize) + (ntablesize))))#define ht_hash_size(ntablemask) \ (((size_t)(uint32_t)-(int32_t)(ntablemask)) * sizeof(uint32_t))#define ht_data_size(ntablesize) \ ((size_t)(ntablesize) * sizeof(bucket))#define ht_size_ex(ntablesize, ntablemask) \ (ht_data_size((ntablesize)) + ht_hash_size((ntablemask)))#define ht_set_data_addr(ht, ptr) do { \ (ht)->ardata = (bucket*)(((char*)(ptr)) + ht_hash_size((ht)->ntablemask)); \ } while (0)#define ht_get_data_addr(ht) \ ((char*)((ht)->ardata) - ht_hash_size((ht)->ntablemask))// 当 hashtable 的 nnumused 大于或等于 ntablesize 时则执行扩容操作#define zend_hash_if_full_do_resize(ht) \ if ((ht)->nnumused >= (ht)->ntablesize) { \ zend_hash_do_resize(ht); \ } # define ht_hash_reset(ht) \ memset(&ht_hash(ht, (ht)->ntablemask), ht_invalid_idx, ht_hash_size((ht)->ntablemask))#define ht_is_without_holes(ht) \ ((ht)->nnumused == (ht)->nnumofelements)// 扩容（精简部分代码）static void zend_fastcall zend_hash_do_resize(hashtable *ht){ if (ht->nnumused > ht->nnumofelements + (ht->nnumofelements >> 5)) { /* additional term is there to amortize the cost of compaction */ zend_hash_rehash(ht); } else if (ht->ntablesize < ht_max_size) { /* let's double the table size */ void *new_data, *old_data = ht_get_data_addr(ht); uint32_t nsize = ht->ntablesize + ht->ntablesize; bucket *old_buckets = ht->ardata; ht->ntablesize = nsize; new_data = pemalloc(ht_size_ex(nsize, ht_size_to_mask(nsize)), gc_flags(ht) & is_array_persistent); ht->ntablemask = ht_size_to_mask(ht->ntablesize); ht_set_data_addr(ht, new_data); memcpy(ht->ardata, old_buckets, sizeof(bucket) * ht->nnumused); pefree(old_data, gc_flags(ht) & is_array_persistent); zend_hash_rehash(ht); } else { zend_error_noreturn(e_error, "possible integer overflow in memory allocation (%u * %zu + %zu)", ht->ntablesize * 2, sizeof(bucket) + sizeof(uint32_t), sizeof(bucket)); }}// rehash（精简部分代码）zend_api int zend_fastcall zend_hash_rehash(hashtable *ht){ bucket *p; uint32_t nindex, i; if (unexpected(ht->nnumofelements == 0)) { if (!(ht_flags(ht) & hash_flag_uninitialized)) { ht->nnumused = 0; ht_hash_reset(ht); } return success; } ht_hash_reset(ht); i = 0; p = ht->ardata; if (ht_is_without_holes(ht)) { // bucket 中没有被标记为 is_undef 的项 do { nindex = p->h | ht->ntablemask; z_next(p->val) = ht_hash(ht, nindex); ht_hash(ht, nindex) = ht_idx_to_hash(i); p++; } while (++i < ht->nnumused); } else { // bucket 中有被标记为 is_undef 的项 uint32_t old_num_used = ht->nnumused; do { if (unexpected(z_type(p->val) == is_undef)) { // bucket 中第一项被标记为 is_undef uint32_t j = i; bucket *q = p; if (expected(!ht_has_iterators(ht))) { // hashtable 没有遍历操作 while (++i < ht->nnumused) { p++; if (expected(z_type_info(p->val) != is_undef)) { zval_copy_value(&q->val, &p->val); q->h = p->h; nindex = q->h | ht->ntablemask; q->key = p->key; z_next(q->val) = ht_hash(ht, nindex); ht_hash(ht, nindex) = ht_idx_to_hash(j); if (unexpected(ht->ninternalpointer == i)) { ht->ninternalpointer = j; } q++; j++; } } } else { // hashtable 存在遍历操作 uint32_t iter_pos = zend_hash_iterators_lower_pos(ht, 0); while (++i < ht->nnumused) { p++; if (expected(z_type_info(p->val) != is_undef)) { zval_copy_value(&q->val, &p->val); q->h = p->h; nindex = q->h | ht->ntablemask; q->key = p->key; z_next(q->val) = ht_hash(ht, nindex); ht_hash(ht, nindex) = ht_idx_to_hash(j); if (unexpected(ht->ninternalpointer == i)) { ht->ninternalpointer = j; } if (unexpected(i >= iter_pos)) { do { zend_hash_iterators_update(ht, iter_pos, j); iter_pos = zend_hash_iterators_lower_pos(ht, iter_pos + 1); } while (iter_pos < i); } q++; j++; } } } ht->nnumused = j; break; } nindex = p->h | ht->ntablemask; z_next(p->val) = ht_hash(ht, nindex); ht_hash(ht, nindex) = ht_idx_to_hash(i); p++; } while (++i < ht->nnumused); /* migrate pointer to one past the end of the array to the new one past the end, so that * newly inserted elements are picked up correctly. */ if (unexpected(ht_has_iterators(ht))) { _zend_hash_iterators_update(ht, old_num_used, ht->nnumused); } } return success;}
php 7 中 hashtable 在扩容时也是将 ntablesize 翻倍，然后进行 rehash。在进行 rehash 操作时，如果 bucket 中没有标记为删除的项（is_undef），那么 rehash 操作之后 bucket 的存储顺序不会发生任何变化，只是 idx 索引存储的位置会因为 ntablemask 的变化而变化，最终导致 hash 碰撞链表的变化。如果 bucket 中存在被标记为删除的项，那么在 rehash 的过程中会跳过这些 bucket 项，只保留那些没有被删除的项。同时，由于这样会导致 bucket 的索引相较于原来发生变化，所以在 rehash 的过程中需要同时更新 hashtable 内部指针的信息以及与遍历操作相关的信息。
⒎ php 7 中的 packed hashtable
在 php 7 中，如果一个数组为索引数组，并且数组中的索引为升序排列，那么此时由于 hashtable 中 bucket 按照写入顺序排列，而数组索引也是升序的，所以映射表已经没有必要。php 7 针对这种特殊的情况对 hashtable 做了一些优化 packed hashtable。
#define ht_min_mask ((uint32_t) -2)#define ht_min_size 8#define ht_hash_reset_packed(ht) do { \ ht_hash(ht, -2) = ht_invalid_idx; \ ht_hash(ht, -1) = ht_invalid_idx; \ } while (0)static zend_always_inline void zend_hash_real_init_packed_ex(hashtable *ht){ void *data; if (unexpected(gc_flags(ht) & is_array_persistent)) { data = pemalloc(ht_size_ex(ht->ntablesize, ht_min_mask), 1); } else if (expected(ht->ntablesize == ht_min_size)) { data = emalloc(ht_size_ex(ht_min_size, ht_min_mask)); } else { data = emalloc(ht_size_ex(ht->ntablesize, ht_min_mask)); } ht_set_data_addr(ht, data); /* don't overwrite iterator count. */ ht->u.v.flags = hash_flag_packed | hash_flag_static_keys; ht_hash_reset_packed(ht);}
packed hashtable 在初始化时，ntablemask 的值默认为 -2，同时在 hashtable 的 flags 中会进行相应的标记。如果此时 packed hashtable 中没有任何元素，那么 ntablesize 会设为 0。
static void zend_fastcall zend_hash_packed_grow(hashtable *ht){ ht_assert_rc1(ht); if (ht->ntablesize >= ht_max_size) { zend_error_noreturn(e_error, "possible integer overflow in memory allocation (%u * %zu + %zu)", ht->ntablesize * 2, sizeof(bucket), sizeof(bucket)); } ht->ntablesize += ht->ntablesize; ht_set_data_addr(ht, perealloc2(ht_get_data_addr(ht), ht_size_ex(ht->ntablesize, ht_min_mask), ht_used_size(ht), gc_flags(ht) & is_array_persistent));}
另外，packed hashtable 在扩容时，只需要将 ntablesize 翻倍，同时由于索引是升序排列的，所以 bucket 的顺序不需要做任何调整，只需要重新分配内存空间即可。
需要强调的是，packed hashtable 只适用于索引为升序排列的索引数组（索引不一定要连续，中间可以有间隔）。如果索引数组的索引顺序被破坏，或索引中加入了字符串索引，那么此时 packed hashtable 会被转换为普通的 hashtable。
推荐学习：《php视频教程》
以上就是数组实现方式：php5 vs php7的详细内容。

数组实现方式：PHP5 VS PHP7

推荐信息