Interview Questions
CHAPTER 6: INTERVIEW QUESTIONS WITH SOLUTIONS
🧠 Yeh questions 90% interviews mein poochhe jaate hain. Har ek ka code + theory DONO yaad rakho. "Approach batao" bole toh theory batao, "Code likho" bole toh code likho.
Q1. How do you detect and handle outliers?
🧠 Outlier kya hai? Wo data point jo baaki sab se bahut alag ho. Jaise class mein sabke marks 50-80 hain, ek bande ke 2 hain — woh outlier hai. IQR method se pehchaante hain.
def detect_outliers_iqr(df, column):
    """Return the rows of *df* whose *column* value lies outside the
    1.5×IQR fences — the same rule box-plot whiskers use.

    Prints the quartiles, the valid range, and the outlier count,
    then returns the outlier rows as a DataFrame.
    """
    q1 = df[column].quantile(0.25)   # 25th percentile
    q3 = df[column].quantile(0.75)   # 75th percentile
    iqr = q3 - q1                    # Interquartile Range
    lo = q1 - 1.5 * iqr
    hi = q3 + 1.5 * iqr
    # Rows falling below the lower fence or above the upper fence
    flagged = df[(df[column] < lo) | (df[column] > hi)]
    print(f"Column: {column}")
    print(f" Q1={q1}, Q3={q3}, IQR={iqr}")
    print(f" Valid range: [{lo}, {hi}]")
    print(f" Outliers found: {len(flagged)}")
    return flagged
# Detect outliers in the revenue column (uses detect_outliers_iqr above;
# assumes `df` is already loaded)
outliers = detect_outliers_iqr(df, 'revenue')
# Option 1: Remove outliers — drop the flagged rows entirely
df_clean = df[~df.index.isin(outliers.index)]
# Option 2: Cap outliers (winsorization) — keep every row but clamp
# extreme values to the IQR fences instead of deleting data
Q1 = df['revenue'].quantile(0.25)
Q3 = df['revenue'].quantile(0.75)
IQR = Q3 - Q1
df['revenue_capped'] = df['revenue'].clip(lower=Q1-1.5*IQR, upper=Q3+1.5*IQR)
Theory: An outlier is a data point that is significantly different from other observations. The IQR method defines outliers as values below Q1 - 1.5×IQR or above Q3 + 1.5×IQR. This is the same logic behind box plot whiskers.
Q2. What is the difference between apply() and map()?
| Method | Works On | Use Case |
|---|---|---|
map() | Series only | Simple element-wise transformation |
apply() | Series or DataFrame | Complex functions, can access entire row |
applymap() | DataFrame only | Element-wise on entire DataFrame (deprecated since pandas 2.1 — prefer DataFrame.map())
# map — simple substitution: each value is looked up in the dict;
# values absent from the mapping become NaN
# NOTE(review): 'Consumer' maps to 'N' — presumably to avoid clashing
# with 'C' for Corporate; confirm this is intended
df['segment_code'] = df['segment'].map({
    'Corporate': 'C',
    'Consumer': 'N',
    'Home Office': 'H'
})
# apply on Series — custom function per element; the guard keeps log()
# away from non-positive values (np.log would yield -inf / nan there)
df['revenue_log'] = df['revenue'].apply(lambda x: np.log(x) if x > 0 else 0)
# apply on DataFrame — the lambda receives one full row at a time
df['summary'] = df.apply(
    lambda row: f"{row['name']} from {row['city']} spent ₹{row['revenue']}",
    axis=1  # axis=1 means apply to each ROW
)
Q3. How do you read a large CSV file that doesn't fit in memory?
# Method 1: stream the file chunk-by-chunk so only `chunk_size` rows are
# ever held in memory; filter each piece, then stitch the keepers together
chunk_size = 10000
kept = []
for chunk in pd.read_csv('huge_file.csv', chunksize=chunk_size):
    # Process each chunk as a small DataFrame
    kept.append(chunk[chunk['amount'] > 1000])
df = pd.concat(kept)

# Method 2: load only the columns you actually need
df = pd.read_csv('huge_file.csv', usecols=['customer_id', 'amount', 'date'])

# Method 3: declare narrower dtypes up front to shrink memory
df = pd.read_csv('huge_file.csv', dtype={
    'customer_id': 'int32',   # Instead of int64
    'amount': 'float32',      # Instead of float64
    'category': 'category'    # For low-cardinality strings
})
Q4. Find the top 3 customers by revenue in each city.
# Method: rank within groups — dense rank means ties share a rank and the
# next rank is not skipped.
# NOTE(review): with revenue ties this can return MORE than 3 rows per city
df['rank'] = df.groupby('city')['revenue'].rank(method='dense', ascending=False)
top3 = df[df['rank'] <= 3].sort_values(['city', 'rank'])
# OR using nlargest within groups
# (groupby.apply over whole groups may emit a deprecation/FutureWarning on
# newer pandas — verify against your version; still the classic interview answer)
top3 = df.groupby('city').apply(
    lambda x: x.nlargest(3, 'revenue')
).reset_index(drop=True)
Q5. Calculate month-over-month revenue growth.
# Bucket orders by calendar month (Period 'M'), then total revenue per month
monthly = df.groupby(df['order_date'].dt.to_period('M'))['revenue'].sum()
monthly_growth = monthly.pct_change() * 100  # % change vs previous month; first month is NaN
print(monthly_growth.round(2))
Q6. Write a function to generate a summary report for any DataFrame.
def data_summary(df):
    """Print a one-stop summary report for any DataFrame.

    Covers shape, column dtypes, missing values, numeric and categorical
    statistics, and the duplicate-row count. Returns None.
    """
    print("=" * 60)
    print("DATASET SUMMARY REPORT")
    print("=" * 60)

    print(f"\n📊 Shape: {df.shape[0]} rows × {df.shape[1]} columns")

    print(f"\n📋 Column Types:")
    for dtype in df.dtypes.unique():
        matching = df.select_dtypes(include=[dtype]).columns.tolist()
        print(f" {dtype}: {len(matching)} columns → {matching}")

    print(f"\n❌ Missing Values:")
    null_counts = df.isnull().sum()
    null_pct = (null_counts / len(df) * 100).round(2)
    for col, n_null in null_counts.items():
        if n_null > 0:
            print(f" {col}: {n_null} ({null_pct[col]}%)")
    if null_counts.sum() == 0:
        print(" None! ✅")

    print(f"\n🔢 Numerical Summary:")
    print(df.describe().round(2))

    print(f"\n📝 Categorical Summary:")
    for col in df.select_dtypes(include='object').columns:
        series = df[col]
        print(f" {col}: {series.nunique()} unique values, "
              f"top = '{series.mode()[0]}'")

    print(f"\n🔁 Duplicates: {df.duplicated().sum()}")
# Usage: pass any loaded DataFrame — the report is printed, nothing is returned
data_summary(df)
CHAPTER 7: COMMON PYTHON INTERVIEW CODE SNIPPETS
Reverse a string
s = "DecisionTree"
s[::-1]  # "eerTnoisiceD" — a slice with step -1 walks the string backwards
Check if a string is a palindrome
def is_palindrome(s):
    """Return True when *s* reads the same forwards and backwards,
    ignoring case and spaces."""
    cleaned = s.replace(" ", "").lower()
    return cleaned == cleaned[::-1]

is_palindrome("madam")  # True
Find the most frequent element in a list
from collections import Counter

nums = [1, 3, 2, 1, 4, 1, 3, 3, 3]
# Counter tallies every element; most_common(1) returns the single
# most frequent (value, count) pair
Counter(nums).most_common(1)  # [(3, 4)] → 3 appears 4 times
Remove duplicates from a list while preserving order
def remove_dupes(lst):
    """Return a new list with duplicates removed, keeping the first
    occurrence of each item in its original position.

    Relies on dict preserving insertion order (Python 3.7+); keys
    deduplicate while remembering first-seen order.
    """
    return list(dict.fromkeys(lst))

remove_dupes([1, 3, 2, 1, 4, 3])  # [1, 3, 2, 4]
Flatten a nested list
nested = [[1, 2], [3, 4], [5, 6]]
# Walk the sublists left to right, appending each one's items in order
flat = []
for sublist in nested:
    flat.extend(sublist)
# [1, 2, 3, 4, 5, 6]
Practice: Work through Kaggle's Pandas exercises and try the Titanic dataset for a real EDA practice run. Aim to complete 3–5 EDA projects before the interview.