pandas dataframe 过滤——apply最灵活！！！

按照某特定string字段长度过滤：

import pandas as pd

# Read both columns as text up front.  If pandas is left to infer a numeric
# dtype and the values are converted to strings afterwards, the length that
# gets measured is that of the re-rendered number rather than of the original
# field: leading zeros are lost ("0000000010" -> "10"), and an integer column
# that contains a missing value is rendered with a trailing ".0".
df = pd.read_csv('filex.csv', dtype={'A': str, 'B': str})

# Keep only the rows where both fields are exactly 10 characters long.
# Rows with a missing value in either column never satisfy the comparison,
# so they are dropped as well.
mask = (df['A'].str.len() == 10) & (df['B'].str.len() == 10)

df = df.loc[mask]

print(df)

Applied to filex.csv:

A,B

123,abc

1234,abcd

1234567890,abcdefghij

The code above prints:

            A           B

2  1234567890  abcdefghij

或者是：

# Sample data: one list per column; every value (including age) is a string.
data = dict(
    names=["Alice", "Zac", "Anna", "O"],
    cars=["Civic", "BMW", "Mitsubishi", "Benz"],
    age=["1", "4", "2", "0"],
)

df = pd.DataFrame(data)

"""

df:

  age        cars  names

0   1       Civic  Alice

1   4         BMW    Zac

2   2  Mitsubishi   Anna

3   0        Benz      O

Then:

"""

# Keep the rows whose name is longer than one character, whose car model
# contains the letter "i", and whose age (stored as text) is below 2.
# Each apply() yields a boolean Series; "&" combines them element-wise.
df.loc[
    df['names'].apply(lambda name: len(name) > 1)
    & df['cars'].apply(lambda car: "i" in car)
    & df['age'].apply(lambda age: int(age) < 2)
]

"""

We will have :

  age   cars  names

0   1  Civic  Alice

"""

最灵活的是用 apply：自定义一个接收整行（row）并返回布尔值的函数，再用 df.apply(..., axis=1) 生成过滤掩码：

def load_metadata(dir_name):
    """Yield one filtered DataFrame per file found under *dir_name*.

    The directory tree is walked with ``os.walk``.  Every file is parsed as a
    ``^``-delimited, UTF-8 CSV; its first line is treated as a header and
    replaced by the names in ``columns_name_list``.  Only the rows for which
    ``metadata_parse_filter`` returns ``True`` are kept.

    Args:
        dir_name: Root directory to walk.

    Yields:
        pandas.DataFrame: The rows of one file that passed the filter.
    """
    # Positions of the columns to read from each file.
    # NOTE(review): pandas ignores the element order of ``usecols``, so the
    # names below are paired with the selected columns in file order.  This
    # presumably relies on the MetaIndex values being ascending in the order
    # listed here -- confirm.
    columns_index_list = [
        MetaIndex.M_METADATA_ID_INDEX,
        MetaIndex.M_SRC_IP_INDEX,
        MetaIndex.M_DST_IP_INDEX,
        MetaIndex.M_SRC_PORT_INDEX,
        MetaIndex.M_DST_PORT_INDEX,
        MetaIndex.M_PROTOCOL_INDEX,
        MetaIndex.M_HEADER_H,
        MetaIndex.M_PAYLOAD_H,
        MetaIndex.M_TCP_FLAG_H,
        MetaIndex.M_FLOW_FIRST_PKT_TIME,
        MetaIndex.M_FLOW_LAST_PKT_TIME,
        MetaIndex.M_OCTET_DELTA_COUNT_FROM_TOTAL_LEN,
    ]

    # Column labels given to the selected columns in the resulting DataFrame.
    columns_name_list = [
        "M_METADATA_ID_INDEX",
        "M_SRC_IP_INDEX",
        "M_DST_IP_INDEX",
        "M_SRC_PORT_INDEX",
        "M_DST_PORT_INDEX",
        "M_PROTOCOL_INDEX",
        "M_HEADER_H",
        "M_PAYLOAD_H",
        "M_TCP_FLAG_H",
        "M_FLOW_FIRST_PKT_TIME",
        "M_FLOW_LAST_PKT_TIME",
        "M_OCTET_DELTA_COUNT_FROM_TOTAL_LEN",
    ]

    def metadata_parse_filter(row):
        """Return True when *row* should be kept, False otherwise.

        A row is kept only if all of the following hold:

        * ``M_PROTOCOL_INDEX`` equals 6 (presumably the IP protocol number
          for TCP -- confirm);
        * ``M_HEADER_H`` and ``M_PAYLOAD_H`` each have a length of at least 2;
        * ``is_l34_tcp_metadata`` accepts ``M_METADATA_ID_INDEX``;
        * both time fields split on ``-`` into at least two integers, and for
          each of the two positions the "first" time is not later than the
          "last" time.

        Any exception raised while checking counts as a rejection.
        """
        try:
            if row['M_PROTOCOL_INDEX'] != 6:
                return False

            if (
                len(row['M_HEADER_H']) < 2
                or len(row['M_PAYLOAD_H']) < 2
                or not is_l34_tcp_metadata(row['M_METADATA_ID_INDEX'])
            ):
                return False

            # Each time field is expected to look like "<int>-<int>"; the
            # second value is named "rev_" (presumably the reverse direction
            # of the flow -- confirm).
            first_time = row['M_FLOW_FIRST_PKT_TIME'].split('-')
            last_time = row['M_FLOW_LAST_PKT_TIME'].split('-')

            flow_first_pkt_time = int(first_time[0])
            rev_flow_first_pkt_time = int(first_time[1])
            flow_last_pkt_time = int(last_time[0])
            rev_flow_last_pkt_time = int(last_time[1])

            if (
                flow_first_pkt_time > flow_last_pkt_time
                or rev_flow_first_pkt_time > rev_flow_last_pkt_time
            ):
                return False

            return True
        except Exception:
            # Deliberately broad: a row that cannot be parsed (missing value,
            # wrong type, malformed time string, ...) is simply filtered out
            # instead of aborting the whole file.
            return False

    for root, dirs, files in os.walk(dir_name):
        for filename in files:
            file_path = os.path.join(root, filename)

            # on_bad_lines='warn' skips malformed lines and emits a warning
            # for each.  It replaces error_bad_lines=False /
            # warn_bad_lines=True, which were deprecated in pandas 1.3 and
            # removed in pandas 2.0.
            df = pd.read_csv(
                file_path,
                delimiter='^',
                usecols=columns_index_list,
                names=columns_name_list,
                encoding='utf-8',
                on_bad_lines='warn',
                header=0,
                lineterminator="\n",
            )

            # Evaluate the filter once per row (axis=1) and keep the rows for
            # which it returned True.
            filter_df = df.loc[df.apply(metadata_parse_filter, axis=1)]

            yield filter_df

这样就可以直接按照 row（行）过滤！

秒客网

pandas dataframe 过滤——apply最灵活！！！

相关文章