Greenplum Python专用库gppylib学习——GpArray

时间:2021-01-12
本文章向大家介绍Greenplum Python专用库gppylib学习——GpArray,主要包括Greenplum Python专用库gppylib学习——GpArray使用实例、应用技巧、基本知识点总结和需要注意事项,具有一定的参考价值,需要的朋友可以参考一下。

  gparray.py依赖的python包(datetime、copy、traceback、os),依赖的gp包(gplog、utils、db、gpversion、commands.unix)

1 from datetime import date
2 import copy
3 import traceback
4 from gppylib.utils import checkNotNone, checkIsInt
5 from gppylib    import gplog
6 from gppylib.db import dbconn
7 from gppylib.gpversion import GpVersion
8 from gppylib.commands.unix import *
9 import os

代码分析

QD(Query Dispatcher)包含master和standby master,QE(Query Executor)包含primary和mirror。每个postgres数据库实例的信息使用GpDB对象表示。Segment对象代表primaryDB和其对应的零个、一个或多个mirrorDB。GpArray对象就是master、standbyMaster和多个Segment对象的组合。

 

GpDB类

GpDB类保存单个dbid所对应的postgres数据库实例的配置信息。大部分成员含义直观,这里只说明__filespaces成员:它是一个字典,key为filespace的oid,value为该filespace在本实例上的目录路径。构造函数会把SYSTEM_FILESPACE(oid为3052)映射到参数datadir,因此datadir就是系统filespace所在的文件路径。

 1 class GpDB:  # configuration of the single postgres instance identified by one dbid
 2     def __init__(self, content, preferred_role, dbid, role, mode, status, hostname, address, port, datadir, replicationPort):
 3         self.content=content
 4         self.preferred_role=preferred_role
 5         self.dbid=dbid
 6         self.role=role
 7         self.mode=mode
 8         self.status=status
 9         self.hostname=hostname
10         self.address=address
11         self.port=port
12         self.datadir=datadir
13         self.replicationPort=replicationPort
14         # Filespace mappings for this segment (filespace oid -> directory path)
15         self.__filespaces = { SYSTEM_FILESPACE: datadir }  # SYSTEM_FILESPACE is the oid of the system filespace (3052); it maps to datadir
16         # Pending filespace creation
17         self.__pending_filespace = None
18         # Catalog directory for each database in this segment
19         self.catdirs = None
20         # Todo: Remove old dead code
21         self.valid = (status == 'u')
22     def __str__(self):  # builds the printable string form of a GpDB
23     def __repr__(self):
24         fsOids = [oid for oid in self.__filespaces]   # collect every key of __filespaces, i.e. the filespace oids
25         fsOids.sort() # sort for determinism
26         filespaces = []
27         for fsoid in fsOids:
28             if fsoid not in [SYSTEM_FILESPACE]:
29                 filespaces.append("%d:%s" % (fsoid, self.__filespaces[fsoid])) # one "oid:path" item per non-system filespace
30         return '%d|%d|%s|%s|%s|%s|%s|%s|%d|%s|%s|%s|%s' % (self.dbid,self.content,self.role,self.preferred_role,self.mode,self.status,self.hostname,self.address,self.port,self.replicationPort,self.datadir,','.join(filespaces),','.join(self.catdirs) if self.catdirs else [])    # last field prints as "[]" when catdirs is None or empty
31     def __cmp__(self,other):  # compares two GpDB objects through their __repr__ serialization
32     def equalIgnoringModeAndStatusAndReplicationPort(self, other):  # returns true when the core attributes (e.g. filespaces) match; called from updateSystemConfig (removing or re-adding a mirror segment changes the catalog)
33     def copy(self):
34     def isSegmentQD(self):
35     def isSegmentMaster(self, current_role=False):
36     ...
37     def isSegmentModeInResynchronization(self):
38     def getSegmentDbId(self):
39     def getSegmentContentId(self):
40     ...
41     def getSegmentFilespaces(self):
42     def setSegmentDbId(self, dbId):
43     def setSegmentContentId(self, contentId):
44     ...
45     def setSegmentDataDirectory(self, dataDirectory):
46     def addSegmentFilespace(self, oid, path):
47     def getSegmentPendingFilespace(self): 
48     @staticmethod  
49     def getDataDirPrefix(datadir):
50         retValue = ""  # immediately overwritten by the next line
51         retValue = datadir[:datadir.rfind('/')]  # everything before the last '/'; rfind returns -1 when there is no '/', which drops the last character instead
52         return retValue       

成员函数createTemplate根据GpDB的信息创建segment模板。第一步,确保dstDir有足够的空间存放segment及其filespace:先用DiskUsage统计dstDir的占用,再遍历__filespaces中除SYSTEM_FILESPACE之外的每个oid,把对应目录的占用大小累加到requiredSize;第二步,获取dstDir的磁盘空闲空间(DiskFree.get_size_local(name = "Check for available free space for segment template", directory = dstDir)),空闲空间不大于requiredSize时抛出异常;第三步,使用LocalDirCopy类对象将segment数据目录拷贝到目标目录dstDir;第四步,如果__filespaces中除了SYSTEM_FILESPACE(oid为3052)之外还有其他filespace,则检查并创建dstDir + "/fs_directory"目录,再为__filespaces中的每个oid创建dstDir + "/fs_directory/<oid>"子目录,并把对应的filespace目录拷贝进去;第五步,删除目标路径下的gp_dbid文件(dstDir + '/gp_dbid'),并对dstDir设置0700权限。

 1     def createTemplate(self, dstDir):
 2         # Make sure we have enough room in the dstDir to fit the segment and its filespaces.
 3         duCmd = DiskUsage(name = "srcDir", directory = dstDir)
 4         duCmd.run(validateAfter=True)
 5         requiredSize = duCmd.get_bytes_used()
 6         name = "segcopy filespace get_size"
 7         for oid in self.__filespaces:
 8             if oid == SYSTEM_FILESPACE:
 9                 continue
10             dir = self.__filespaces[oid]
11             duCmd = DiskUsage(name, dir)
12             duCmd.run(validateAfter=True)
13             size = duCmd.get_bytes_used()
14             requiredSize = requiredSize + size
15         dstBytesAvail = DiskFree.get_size_local(name = "Check for available free space for segment template", directory = dstDir)
16         if dstBytesAvail <= requiredSize:
17             raise Exception("Not enough space on directory: '%s'.  Currently %d bytes free but need %d bytes." % (dstDir, int(dstBytesAvail), int(requiredSize)))
18         logger.info("Starting copy of segment dbid %d to location %s" % (int(self.getSegmentDbId()), dstDir))
19         cpCmd = LocalDirCopy("Copy system data directory", self.getSegmentDataDirectory(), dstDir)
20         cpCmd.run(validateAfter = True)
21         res = cpCmd.get_results()
22         if len(self.__filespaces) > 1:
23             """ Make directory to hold file spaces """
24             fullPathFsDir = dstDir + "/" +  DESTINATION_FILE_SPACES_DIRECTORY # DESTINATION_FILE_SPACES_DIRECTORY = "fs_directory"
25             cmd = FileDirExists( name = "check for existance of template filespace directory", directory = fullPathFsDir)
26             cmd.run(validateAfter = True)
27             MakeDirectory.local("gpexpand make directory to hold file spaces", fullPathFsDir)
28             for oid in self.__filespaces:
29                 MakeDirectory.local("gpexpand make directory to hold file space oid: " + str(oid), fullPathFsDir)
30                 dir = self.__filespaces[oid]
31                 destDir = fullPathFsDir + "/" + str(oid)
32                 MakeDirectory.local("gpexpand make directory to hold file space: " + destDir, destDir)
33                 name = "GpSegCopy %s to %s" % (dir, destDir)
34                 cpCmd = LocalDirCopy(name, dir, destDir)
35                 cpCmd.run(validateAfter = True)
36                 res = cpCmd.get_results()
37             # Remove the gp_dbid file from the data dir
38             RemoveFile.local('Remove gp_dbid file', os.path.normpath(dstDir + '/gp_dbid'))
39             logger.info("Cleaning up catalog for schema only copy on destination")
40             # We need 700 permissions or postgres won't start
41             Chmod.local('set template permissions', dstDir, '0700')

静态成员函数initFromString(s)为工厂函数,从字符串中初始化GpDB对象,该字符串和repr()输出兼容。

 1     @staticmethod
 2     def initFromString(s):  # factory: parses the '|'-separated text form of a GpDB (8, 12 or 13 fields)
 3         tup = s.strip().split('|')
 4         # Old format: 8 fields    Todo: remove the need for this, or rework it to be cleaner
 5         if len(tup) == 8:
 6             # This describes the gp_configuration catalog (pre 3.4)
 7             content         = int(tup[0])
 8             ...
 9             datadir         = tup[7]
10             # Calculate new fields from old ones           
11             # Note: this should be kept in sync with the code in
12             # GpArray.InitFromCatalog() code for initializing old catalog
13             # formats.
14             preferred_role  = ROLE_PRIMARY if definedprimary else ROLE_MIRROR
15             role            = ROLE_PRIMARY if isprimary else ROLE_MIRROR
16             hostname        = None
17             mode            = MODE_SYNCHRONIZED       # ???
18             status          = STATUS_UP if valid else STATUS_DOWN
19             replicationPort = None
20             filespaces      = ""
21             catdirs         = ""
22         # Catalog 3.4 format: 12 fields
23         elif len(tup) == 12:
24             # This describes the gp_segment_configuration catalog (3.4)
25             dbid            = int(tup[0])
26             ...     
27             catdirs         = ""
28         # Catalog 4.0+: 13 fields
29         elif len(tup) == 13:
30             # This describes the gp_segment_configuration catalog (3.4+)
31             dbid            = int(tup[0])
32             ...
33             catdirs         = tup[12]
34         else:
35             raise Exception("GpDB unknown input format: %s" % s)
36         # Initialize segment without filespace information
37         gpdb = GpDB(content=content,preferred_role=preferred_role,dbid=dbid,role=role,mode=mode,status=status,hostname=hostname,address=address,port=port,datadir=datadir,replicationPort=replicationPort)
38         # Add in filespace information, if present (comma-separated "oid:path" items)
39         for fs in filespaces.split(","):
40             if fs == "":
41                 continue
42             (fsoid, fselocation) = fs.split(":")
43             gpdb.addSegmentFilespace(fsoid, fselocation)  # NOTE(review): fsoid is still a str here, while __repr__ formats filespace oids with %d -- confirm whether addSegmentFilespace converts it
44         # Add Catalog Dir, if present
45         gpdb.catdirs = []
46         for d in catdirs.split(","):
47             if d == "":
48                 continue
49             gpdb.catdirs.append(d)
50         # Return the completed segment
51         return gpdb

Segment类

Segment类代表相同contentID的SegmentDBs,目前至多一个primary SegDB和单个mirror SegDB,在后续版本中会支持多mirror SegDB。

 1 class Segment:
 2     primaryDB=None # primary (a GpDB instance)
 3     mirrorDBs =None
 4     def __init__(self):
 5         self.mirrorDBs = [] # mirrors (GpDB instances)
 6         pass
 7     def addPrimary(self,segDB) # sets the primary
 8     def addMirror(self,segDB) # appends a mirror
 9     def get_dbs(self) # returns the primary and mirror instances as one list of GpDB objects
10     def get_hosts(self) # returns the host names of the hosts running the primary and the mirrors
11     def is_segment_pair_valid(self):
12         """Validates that the primary/mirror pair are in a valid state"""
13         for mirror_db in self.mirrorDBs:
14             prim_status = self.primaryDB.getSegmentStatus()
15             prim_mode = self.primaryDB.getSegmentMode()
16             mirror_status = mirror_db.getSegmentStatus()
17             mirror_role = mirror_db.getSegmentMode()  # despite the name, this holds the mirror's mode
18             if (prim_status, prim_mode, mirror_status, mirror_role) not in VALID_SEGMENT_STATES:
19                 return False
20         return True  # also reached when there are no mirrors at all

primary和mirror对的合法状态如下,各个字段含义如下:primaryDB.getSegmentStatus、primaryDB.getSegmentMode、mirror_db.getSegmentStatus、mirror_db.getSegmentMode。
# Each tuple is (primary status, primary mode, mirror status, mirror mode).
VALID_SEGMENT_STATES = [
(STATUS_UP, MODE_CHANGELOGGING, STATUS_DOWN, MODE_SYNCHRONIZED),
(STATUS_UP, MODE_CHANGELOGGING, STATUS_DOWN, MODE_RESYNCHRONIZATION),
(STATUS_UP, MODE_RESYNCHRONIZATION, STATUS_UP, MODE_RESYNCHRONIZATION),
(STATUS_UP, MODE_SYNCHRONIZED, STATUS_UP, MODE_SYNCHRONIZED)
]

    • primaryDB状态为up,模式为CHANGELOGGING,mirrorDB状态为down,模式可以为SYNCHRONIZED、RESYNCHRONIZATION
    • primaryDB状态为up,模式为RESYNCHRONIZATION,mirrorDB状态为up,模式为RESYNCHRONIZATION
    • primaryDB状态为up,模式为SYNCHRONIZED,mirrorDB状态为up,模式为SYNCHRONIZED
      如果要返回primaryDB的主机名,可使用segment1.primaryDB.getSegmentHostName()。

GpArray类

GpArray类构造函数接受包含QD和QE的GpDB的列表segments

 1 class GpArray:
 2     def __init__(self, segments, segmentsAsLoadedFromDb=None, strategyLoadedFromDb=None):  # segments: list of GpDB objects covering both QD and QE nodes
 3         self.master =None  # GpDB instance
 4         self.standbyMaster = None # GpDB instance
 5         self.segments = [] # list of Segment instances
 6         self.expansionSegments=[]
 7         self.numPrimarySegments = 0
 8         self.recoveredSegmentDbids = []
 9         self.__version = None
10         self.__segmentsAsLoadedFromDb = segmentsAsLoadedFromDb
11         self.__strategyLoadedFromDb = strategyLoadedFromDb
12         self.__strategy = FAULT_STRATEGY_NONE  # FAULT_STRATEGY_NONE = 'n'  # mirrorless systems
13         self.setFilespaces([])
14         for segdb in segments:
15             # Handle QD nodes
16             if segdb.isSegmentMaster(True):
17                 if self.master != None:
18                     logger.error("multiple master dbs defined")
19                     raise Exception("GpArray - multiple master dbs defined")
20                 self.master = segdb
21             elif segdb.isSegmentStandby(True):
22                 if self.standbyMaster != None:
23                     logger.error("multiple standby master dbs defined")
24                     raise Exception("GpArray - multiple standby master dbs defined")
25                 self.standbyMaster = segdb
26             # Handle regular segments (QE nodes)
27             elif segdb.isSegmentQE():
28                 if segdb.isSegmentMirror():
29                     self.__strategy = FAULT_STRATEGY_FILE_REPLICATION  # FAULT_STRATEGY_FILE_REPLICATION = 'f'   # valid for versions 4.0+; chosen as soon as one mirror is present
30                 self.addSegmentDb(segdb)
31             else:
32                 # Not a master, standbymaster, primary, or mirror?
33                 # shouldn't even be possible.
34                 logger.error("FATAL - invalid dbs defined")
35                 raise Exception("Error: GpArray() - invalid dbs defined")
36         # Make sure we have a master db
37         if self.master is None:
38             logger.error("FATAL - no master dbs defined!")
39             raise Exception("Error: GpArray() - no master dbs defined")  
40     def __str__(self):
41     def hasStandbyMaster(self):
42     def addSegmentDb(self, segdb): # segdb is a GpDB instance; adds a new Segment to self.segments, or attaches the GpDB to an existing Segment (addPrimary or addMirror)
43     def isStandardArray(self):
44     def is_array_valid(self):
45     def dumpToFile(self, filename):
46     def setFaultStrategy(self, strategy):
47     def getFaultStrategy(self):
48     ....                  

initFromCatalog从数据库中获取GpArray对象的数据成员的数据,形参为数据库URL,设置utility模式。主要是一些查找数据库状态信息的SQL,作为DBA需要收集学习这些SQL,以备后续学习运维使用。

  1     @staticmethod
  2     def initFromCatalog(dbURL, utility=False):
  3         conn = dbconn.connect(dbURL, utility)
  4         # Get the version from the database:
  5         version_str = None
  6         for row in dbconn.execSQL(conn, "SELECT version()"):
  7             version_str = row[0]
  8         version = GpVersion(version_str)
  9         if version.getVersionRelease() in ("3.0", "3.1", "3.2", "3.3"):
 10             # In older releases we get the fault strategy using the
 11             # gp_fault_action guc.
 12             strategy_rows = dbconn.execSQL(conn, "show gp_fault_action")
 13             # Note: Mode may not be "right", certainly 4.0 concepts of mirroring
 14             # mode do not apply to 3.x, so it depends on how the scripts are
 15             # making use of mode.  For now it is initialized to synchronized.
 16             #
 17             # Note: hostname is initialized to null since the catalog does not
 18             # contain this information.  Initializing a hostcache using the
 19             # resulting gparray will automatically fill in a value for hostname.
 20             #
 21             # Note: this should be kept in sync with the code in
 22             # GpDB.InitFromString() code for initializing old catalog formats.
 23             config_rows = dbconn.execSQL(conn, '''
 24                 SELECT dbid, content,case when isprimary then 'p' else 'm' end as role,
 25                        case when definedprimary then 'p' else 'm' end as preferred_role,
 26                        's' as mode,case when valid then 'u' else 'd' end as status,
 27                        null as hostname,hostname as address,port,null as replication_port,
 28                        %s as fsoid,datadir as fselocation FROM pg_catalog.gp_configuration
 29                 ORDER BY content, preferred_role DESC
 30             ''' % str(SYSTEM_FILESPACE))
 31             # no filespace support in older releases.
 32             filespaceArr = []
 33         else:
 34             strategy_rows = dbconn.execSQL(conn, '''
 35                 SELECT fault_strategy FROM gp_fault_strategy
 36             ''')
 37             config_rows = dbconn.execSQL(conn, '''
 38                 SELECT dbid, content, role, preferred_role, mode, status,
 39                        hostname, address, port, replication_port, fs.oid,
 40                        fselocation
 41                 FROM pg_catalog.gp_segment_configuration
 42                 JOIN pg_catalog.pg_filespace_entry on (dbid = fsedbid)
 43                 JOIN pg_catalog.pg_filespace fs on (fsefsoid = fs.oid)
 44                 ORDER BY content, preferred_role DESC, fs.oid
 45             ''')
 46             filespaceRows = dbconn.execSQL(conn, '''
 47                 SELECT oid, fsname FROM pg_filespace ORDER BY fsname;
 48             ''')
 49             filespaceArr = [GpFilespaceObj(fsRow[0], fsRow[1]) for fsRow in filespaceRows]
 50         # Todo: add checks that all segments should have the same filespaces?
 51         recoveredSegmentDbids = []
 52         segments = []
 53         seg = None
 54         for row in config_rows:
 55             # Extract fields from the row
 56             (dbid, content, role, preferred_role, mode, status, hostname,
 57              address, port, replicationPort, fsoid, fslocation) = row
 58             # If we have segments which have recovered, record them.
 59             if preferred_role != role and content >= 0:
 60                 if mode == MODE_SYNCHRONIZED and status == STATUS_UP:
 61                     recoveredSegmentDbids.append(dbid)
 62             # The query returns all the filespaces for a segment on separate
 63             # rows.  If this row is the same dbid as the previous row simply
 64             # add this filespace to the existing list, otherwise create a
 65             # new segment.
 66             if seg and seg.getSegmentDbId() == dbid:
 67                 seg.addSegmentFilespace(fsoid, fslocation)
 68             else:
 69                 seg = GpDB(content, preferred_role, dbid, role, mode, status,
 70                            hostname, address, port, fslocation, replicationPort)
 71                 segments.append(seg)
 72         datcatloc = dbconn.execSQL(conn, '''
 73             select fsloc.dbid, fsloc.fselocation || '/' || case when db.dattablespace = 1663
 74                       then 'base' else db.dattablespace::text end || '/'||db.oid as catloc
 75             from pg_Database db, pg_tablespace ts,
 76                  (SELECT dbid, fs.oid, fselocation
 77                   FROM pg_catalog.gp_segment_configuration
 78                   JOIN pg_catalog.pg_filespace_entry on (dbid = fsedbid)
 79                   JOIN pg_catalog.pg_filespace fs on (fsefsoid = fs.oid)) fsloc
 80                   where db.dattablespace = ts.oid
 81                   and ts.spcfsoid = fsloc.oid''')
 82         conn.close()
 83         catlocmap = {}
 84         for row in datcatloc:
 85             if catlocmap.has_key(row[0]):
 86                 catlocmap[row[0]].append(row[1])
 87             else:
 88                 catlocmap[row[0]] = [row[1]]
 89         for seg in segments:
 90             seg.catdirs = catlocmap[seg.dbid]
 91         origSegments = [seg.copy() for seg in segments]
 92         if strategy_rows.rowcount == 0:
 93             raise Exception("Database does not contain gp_fault_strategy entry")
 94         if strategy_rows.rowcount > 1:
 95             raise Exception("Database has too many gp_fault_strategy entries")
 96         strategy = strategy_rows.fetchone()[0]
 97         array = GpArray(segments, origSegments, strategy)
 98         array.__version = version
 99         array.recoveredSegmentDbids = recoveredSegmentDbids
100         array.setFaultStrategy(strategy) # override the preliminary default `__strategy` with the database state, if available
101         array.setFilespaces(filespaceArr)
102         return array

initFromFile函数从文件中读取GpArray的信息,通过GpDB的initFromString函数,并使用GpArray构造函数创建GpArray对象。

1     @staticmethod
2     def initFromFile(filename):
3         segdbs=[]
4         fp = open(filename, 'r')
5         for line in fp:
6             segdbs.append(GpDB.initFromString(line))
7         fp.close()
8         return GpArray(segdbs)

使用

通过gppylib的system文件夹下提供的configurationInterface接口,注册配置Provider,并初始化Provider,通过调用loadSystemConfig函数加载GpArray对象。get_gparray_from_config函数返回GpArray对象。

 1 def get_gparray_from_config():
 2     # imports below, when moved to the top, seem to cause an import error in a unit test because of dependency issue
 3     from gppylib.system import configurationInterface
 4     from gppylib.system import configurationImplGpdb
 5     from gppylib.system.environment import GpMasterEnvironment
 6     master_data_dir = os.environ['MASTER_DATA_DIRECTORY']
 7     gpEnv = GpMasterEnvironment(master_data_dir, False)
 8     configurationInterface.registerConfigurationProvider(configurationImplGpdb.GpConfigurationProviderUsingGpdbCatalog())
 9     confProvider = configurationInterface.getConfigurationProvider().initializeProvider(gpEnv.getMasterPort())
10     return confProvider.loadSystemConfig(useUtilityMode=True)

代码来自于greenplum-db-5.27.1源代码